From 4e34323135ec0752af152ac588a4f96495074849 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Sun, 11 Aug 2019 16:27:41 -0400 Subject: ext4: fix warning when turn on dioread_nolock and inline_data mkfs.ext4 -O inline_data /dev/vdb mount -o dioread_nolock /dev/vdb /mnt echo "some inline data..." >> /mnt/test-file echo "some inline data..." >> /mnt/test-file sync The above script will trigger "WARN_ON(!io_end->handle && sbi->s_journal)" because ext4_should_dioread_nolock() returns false for a file with inline data. Move the check to a place after we have already removed the inline data and prepared inode to write normal pages. Reviewed-by: Jan Kara Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..a6523516d681 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2785,15 +2785,6 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; } - if (ext4_should_dioread_nolock(inode)) { - /* - * We may need to convert up to one extent per block in - * the page and we may dirty the inode. - */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); - } - /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so @@ -2812,6 +2803,15 @@ static int ext4_writepages(struct address_space *mapping, ext4_journal_stop(handle); } + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. 
+ */ + rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, + PAGE_SIZE >> inode->i_blkbits); + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; -- cgit From 991f52306ab8b5d8427f6f953162b5ab92e88e51 Mon Sep 17 00:00:00 2001 From: Shi Siyuan Date: Sun, 11 Aug 2019 16:28:41 -0400 Subject: ext4: remove unnecessary error check Remove unnecessary error check in ext4_file_write_iter(), because this check will be done later in the call chain -- ext4_write_checks() -> generic_write_checks() Change-Id: I7b0ab27f693a50765c15b5eaa3f4e7c38f42e01e Signed-off-by: shisiyuan Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 70b0438dbc94..bc378a7ac950 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -230,8 +230,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif - if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) -- cgit From b0c013e2928d3696ceb6401311dbc1d7fcccd6dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:30:41 -0400 Subject: ext4: add a new ioctl EXT4_IOC_CLEAR_ES_CACHE The new ioctl EXT4_IOC_CLEAR_ES_CACHE will force an inode's extent status cache to be cleared out. This is intended for use in debugging.
Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents_status.c | 28 ++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 1 + fs/ext4/ioctl.c | 9 +++++++++ 4 files changed, 40 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..b22f24f1d365 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -649,6 +649,8 @@ enum { #define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 7521de2dcf3a..02cc8eb3eb0e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1374,6 +1374,34 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) return nr_shrunk; } +/* + * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove + * discretionary entries from the extent status cache. (Some entries + * must be present for proper operations.) 
+ */ +void ext4_clear_inode_es(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct extent_status *es; + struct ext4_es_tree *tree; + struct rb_node *node; + + write_lock(&ei->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + tree->cache_es = NULL; + node = rb_first(&tree->root); + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(node); + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + } + } + ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); + write_unlock(&ei->i_es_lock); +} + #ifdef ES_DEBUG__ static void ext4_print_pending_tree(struct inode *inode) { diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 131a8b7df265..e16785f431e7 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -248,5 +248,6 @@ extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); +extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..15b1047878ab 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1115,6 +1115,14 @@ resizefs_out: case EXT4_IOC_GET_ENCRYPTION_POLICY: return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_CLEAR_ES_CACHE: + { + if (!inode_owner_or_capable(inode)) + return -EACCES; + ext4_clear_inode_es(inode); + return 0; + } + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1233,6 +1241,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: + case EXT4_IOC_CLEAR_ES_CACHE: break; default: return -ENOIOCTLCMD; -- cgit From 1ad3ea6e0a694b0486eb2cbe60378ad0fbf23642 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:31:41 -0400 
Subject: ext4: add a new ioctl EXT4_IOC_GETSTATE The new ioctl EXT4_IOC_GETSTATE returns some of the dynamic state of an ext4 inode for debugging purposes. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++++ fs/ext4/ioctl.c | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b22f24f1d365..ee296797bcd2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -651,6 +651,7 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -664,6 +665,16 @@ enum { #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 15b1047878ab..ffb7bde4900d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1123,6 +1123,22 @@ resizefs_out: return 0; } + case EXT4_IOC_GETSTATE: + { + __u32 state = 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) + state |= EXT4_STATE_FLAG_EXT_PRECACHED; + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + state |= EXT4_STATE_FLAG_NEW; + if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + state |= EXT4_STATE_FLAG_NEWENTRY; + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) + state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; + + return put_user(state, (__u32 __user *) arg); + } + case 
EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1242,6 +1258,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case EXT4_IOC_CLEAR_ES_CACHE: + case EXT4_IOC_GETSTATE: break; default: return -ENOIOCTLCMD; -- cgit From bb5835edcdf8bf78bbe51cff13e332c439bc0567 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:32:41 -0400 Subject: ext4: add new ioctl EXT4_IOC_GET_ES_CACHE For debugging reasons, it's useful to know the contents of the extent cache. Since the extent cache contains much of what is in the fiemap ioctl, use an fiemap-style interface to return this information. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 10 ++++++ fs/ext4/extents.c | 94 +++++++++++++++++++++++++++++++++++++++++++----- fs/ext4/extents_status.c | 10 ++++++ fs/ext4/extents_status.h | 1 + fs/ext4/inode.c | 6 ++-- fs/ext4/ioctl.c | 72 +++++++++++++++++++++++++++++++++++++ 6 files changed, 182 insertions(+), 11 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ee296797bcd2..e2d8ad27f4d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -652,6 +652,7 @@ enum { /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -692,6 +693,12 @@ enum { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. 
+ */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -3258,6 +3265,9 @@ extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92266a2da7d6..0620d495fd8a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode, return err; } +static int ext4_fill_es_cache_info(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +{ + ext4_lblk_t next, end = block + num - 1; + struct extent_status es; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned int flags; + int err; + + while (block <= end) { + next = 0; + flags = 0; + if (!ext4_es_lookup_extent(inode, block, &next, &es)) + break; + if (ext4_es_is_unwritten(&es)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + if (ext4_es_is_delayed(&es)) + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ext4_es_is_hole(&es)) + flags |= EXT4_FIEMAP_EXTENT_HOLE; + if (next == 0) + flags |= FIEMAP_EXTENT_LAST; + if (flags & (FIEMAP_EXTENT_DELALLOC| + EXT4_FIEMAP_EXTENT_HOLE)) + es.es_pblk = 0; + else + es.es_pblk = ext4_es_pblock(&es); + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (next == 0) + break; + block = next; + if (err < 
0) + return err; + if (err == 1) + return 0; + } + return 0; +} + + /* * ext4_ext_determine_hole - determine hole around given block * @inode: inode we lookup in @@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode, return next_del; } -/* fiemap flags we can handle specified here */ -#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) static int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) @@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) +static int _ext4_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, + int (*fill)(struct inode *, ext4_lblk_t, + ext4_lblk_t, + struct fiemap_extent_info *)) { ext4_lblk_t start_blk; + u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR; + int error = 0; if (ext4_has_inline_data(inode)) { @@ -5075,14 +5125,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_ext_precache(inode); if (error) return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && + fill == ext4_fill_fiemap_extents) return generic_block_fiemap(inode, fieinfo, start, len, ext4_get_block); - if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + if (fill == ext4_fill_es_cache_info) + ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; + if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { @@ -5101,12 +5155,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information * and pushing extents back to the user. 
*/ - error = ext4_fill_fiemap_extents(inode, start_blk, - len_blks, fieinfo); + error = fill(inode, start_blk, len_blks, fieinfo); } return error; } +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return _ext4_fiemap(inode, fieinfo, start, len, + ext4_fill_fiemap_extents); +} + +int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + if (ext4_has_inline_data(inode)) { + int has_inline; + + down_read(&EXT4_I(inode)->xattr_sem); + has_inline = ext4_has_inline_data(inode); + up_read(&EXT4_I(inode)->xattr_sem); + if (has_inline) + return 0; + } + + return _ext4_fiemap(inode, fieinfo, start, len, + ext4_fill_es_cache_info); +} + + /* * ext4_access_path: * Function to access the path buffer for marking it dirty. diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 02cc8eb3eb0e..a959adc59bcd 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, struct extent_status *es) { struct ext4_es_tree *tree; @@ -948,6 +949,15 @@ out: if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); stats->es_stats_cache_hits++; + if (next_lblk) { + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, + rb_node); + *next_lblk = es1->es_lblk; + } else + *next_lblk = 0; + } } else { stats->es_stats_cache_misses++; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index e16785f431e7..eb56a1289031 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t 
*next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6523516d681..4b92c7603907 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -527,7 +527,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -695,7 +695,7 @@ found: * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } @@ -1868,7 +1868,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, &es)) { + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ffb7bde4900d..d6242b7b8718 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } +/* copied from fs/ioctl.c */ +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. 
+ */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1139,6 +1207,9 @@ resizefs_out: return put_user(state, (__u32 __user *) arg); } + case EXT4_IOC_GET_ES_CACHE: + return ext4_ioctl_get_es_cache(filp, arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1259,6 +1330,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: + case EXT4_IOC_GET_ES_CACHE: 
break; default: return -ENOIOCTLCMD; -- cgit From cd2d99229dc96219547e6349841e1aad851c6acc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 12 Aug 2019 13:44:49 -0400 Subject: ext4: drop legacy pre-1970 encoding workaround Originally, support for expanded timestamps had a bug in that pre-1970 times were erroneously encoded as being in the 24th century. This was fixed in commit a4dad1ae24f8 ("ext4: Fix handling of extended tv_sec") which landed in 4.4. Starting with 4.4, pre-1970 timestamps were correctly encoded, but for backwards compatibility those incorrectly encoded timestamps were mapped back to the pre-1970 dates. Given that backwards compatibility workaround has been around for 4 years, and given that running e2fsck from e2fsprogs 1.43.2 and later will offer to fix these timestamps (which has been released for 3 years), it's past time to drop the legacy workaround from the kernel. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2d8ad27f4d1..17cc2dc13174 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,21 +828,8 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time) static inline void ext4_decode_extra_time(struct timespec64 *time, __le32 extra) { - if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { - -#if 1 - /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. (This backwards compatibility may be removed - * at the discretion of the ext4 developers.)
- */ - u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; - if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) - extra_bits = 0; - time->tv_sec += extra_bits << 32; -#else + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; -#endif - } time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } -- cgit From 7a14826ede1d714f0bb56de8167c0e519041eeda Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Aug 2019 14:29:38 -0400 Subject: ext4: set error return correctly when ext4_htree_store_dirent fails Currently when the call to ext4_htree_store_dirent fails the error return variable 'ret' is not being set to the error code and variable 'count' is set instead, hence the error code is not being returned. Fix this by assigning the error return code to ret. Addresses-Coverity: ("Unused value") Fixes: 8af0f0822797 ("ext4: fix readdir error in the case of inline_data+dir_index") Signed-off-by: Colin Ian King Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 88cdf3c90bd1..2fec62d764fa 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1416,7 +1416,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); if (err) { - count = err; + ret = err; goto out; } count++; -- cgit From 29b3692e6dbf82266ec3c2764c236f8708d7fc89 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 4 Aug 2019 19:35:48 -0700 Subject: ext4: wire up new fscrypt ioctls Wire up the new ioctls for adding and removing fscrypt keys to/from the filesystem, and the new ioctl for retrieving v2 encryption policies. The key removal ioctls also required making ext4_drop_inode() call fscrypt_drop_inode().
For more details see Documentation/filesystems/fscrypt.rst and the fscrypt patches that added the implementation of these ioctls. Reviewed-by: Theodore Ts'o Signed-off-by: Eric Biggers --- fs/ext4/ioctl.c | 30 ++++++++++++++++++++++++++++++ fs/ext4/super.c | 3 +++ 2 files changed, 33 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..fe5a4b13f939 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1115,6 +1115,31 @@ resizefs_out: case EXT4_IOC_GET_ENCRYPTION_POLICY: return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_add_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_remove_key_all_users(filp, + (void __user *)arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1231,6 +1256,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: break; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 
4079605d437a..757819139b8f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1107,6 +1107,9 @@ static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); + if (!drop) + drop = fscrypt_drop_inode(inode); + trace_ext4_drop_inode(inode, drop); return drop; } -- cgit From c93d8f88580921c84d2213161ef3c22560511b84 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:24 -0700 Subject: ext4: add basic fs-verity support Add most of fs-verity support to ext4. fs-verity is a filesystem feature that enables transparent integrity protection and authentication of read-only files. It uses a dm-verity like mechanism at the file level: a Merkle tree is used to verify any block in the file in log(filesize) time. It is implemented mainly by helper functions in fs/verity/. See Documentation/filesystems/fsverity.rst for the full documentation. This commit adds all of ext4 fs-verity support except for the actual data verification, including: - Adding a filesystem feature flag and an inode flag for fs-verity. - Implementing the fsverity_operations to support enabling verity on an inode and reading/writing the verity metadata. - Updating ->write_begin(), ->write_end(), and ->writepages() to support writing verity metadata pages. - Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl(). ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. This approach works because (a) verity files are readonly, and (b) pages fully beyond i_size aren't visible to userspace but can be read/written internally by ext4 with only some relatively small changes to ext4. This approach avoids having to depend on the EA_INODE feature and on rearchitecturing ext4's xattr support to support paging multi-gigabyte xattrs into memory, and to support encrypting xattrs. 
Note that the verity metadata *must* be encrypted when the file is, since it contains hashes of the plaintext data. This patch incorporates work by Theodore Ts'o and Chandan Rajendra. Reviewed-by: Theodore Ts'o Signed-off-by: Eric Biggers --- fs/ext4/Makefile | 1 + fs/ext4/ext4.h | 21 +++- fs/ext4/file.c | 4 + fs/ext4/inode.c | 53 +++++--- fs/ext4/ioctl.c | 13 ++ fs/ext4/super.c | 9 ++ fs/ext4/sysfs.c | 6 + fs/ext4/verity.c | 367 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 457 insertions(+), 17 deletions(-) create mode 100644 fs/ext4/verity.c (limited to 'fs/ext4') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8fdfcd3c3e04..b17ddc229ac5 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -13,3 +13,4 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..736972f46ea6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -41,6 +41,7 @@ #endif #include +#include #include @@ -395,6 +396,7 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. 
*/ @@ -402,7 +404,7 @@ struct flex_groups { #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */ +#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ @@ -467,6 +469,7 @@ enum { EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ @@ -512,6 +515,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); @@ -1560,6 +1564,7 @@ enum { EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ + EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1610,6 +1615,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_SB(sb) (sb) #endif +static inline bool ext4_verity_in_progress(struct inode *inode) +{ + return IS_ENABLED(CONFIG_FS_VERITY) && + ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); +} + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -1662,6 +1673,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define 
EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 +#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1756,6 +1768,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) +EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) @@ -1813,7 +1826,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ - EXT4_FEATURE_RO_COMPAT_PROJECT) + EXT4_FEATURE_RO_COMPAT_PROJECT |\ + EXT4_FEATURE_RO_COMPAT_VERITY) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -3283,6 +3297,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +/* verity.c */ +extern const struct fsverity_operations ext4_verityops; + /* * Add new method to test whether block and inode bitmaps are properly * initialized. 
With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 70b0438dbc94..b8a20bb9a145 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -457,6 +457,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret) return ret; + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; + /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..6de3d4ba28f3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1340,6 +1340,9 @@ retry_journal: } if (ret) { + bool extended = (pos + len > inode->i_size) && + !ext4_verity_in_progress(inode); + unlock_page(page); /* * __block_write_begin may have instantiated a few blocks @@ -1349,11 +1352,11 @@ retry_journal: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); - if (pos + len > inode->i_size) { + if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might @@ -1406,6 +1409,7 @@ static int ext4_write_end(struct file *file, int ret = 0, ret2; int i_size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (inline_data) { @@ -1423,12 +1427,16 @@ static int ext4_write_end(struct file *file, /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. + * + * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree + * blocks are being written past EOF, so skip the i_size update. 
*/ - i_size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily @@ -1439,7 +1447,7 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ext4_mark_inode_dirty(handle, inode); - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1450,7 +1458,7 @@ errout: if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -1511,6 +1519,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; int size_changed = 0; int inline_data = ext4_has_inline_data(inode); + bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); @@ -1540,13 +1549,14 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - size_changed = ext4_update_inode_size(inode, pos + copied); + if (!verity) + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); put_page(page); - if (old_size < pos) + if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed || inline_data) { @@ -1555,7 +1565,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } - if (pos + len > inode->i_size && ext4_can_truncate(inode)) + if (pos + len > 
inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1566,7 +1576,7 @@ errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - if (pos + len > inode->i_size) { + if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be @@ -2162,7 +2172,8 @@ static int ext4_writepage(struct page *page, trace_ext4_writepage(page); size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2246,7 +2257,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT) + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; @@ -2345,6 +2357,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; + if (ext4_verity_in_progress(inode)) + blocks = EXT_MAX_BLOCKS; + do { BUG_ON(buffer_locked(bh)); @@ -3061,8 +3076,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || - S_ISLNK(inode->i_mode)) { + if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || + ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -4739,6 +4754,8 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; + if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) + return false; return true; } @@ 
-4763,9 +4780,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; + if (flags & EXT4_VERITY_FL) + new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| - S_ENCRYPTED|S_CASEFOLD); + S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, @@ -5555,6 +5574,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + error = fsverity_prepare_setattr(dentry, attr); + if (error) + return error; + if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 442f7ef873fc..ce811df71690 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1171,6 +1171,17 @@ out: } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); + + case FS_IOC_ENABLE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_enable(filp, (const void __user *)arg); + + case FS_IOC_MEASURE_VERITY: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_measure(filp, (void __user *)arg); + default: return -ENOTTY; } @@ -1233,6 +1244,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..05a9874687c3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1179,6 +1179,7 @@ void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -4272,6 +4273,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_FS_ENCRYPTION 
sb->s_cop = &ext4_cryptops; #endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &ext4_verityops; +#endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -4419,6 +4423,11 @@ no_journal: goto failed_mount_wq; } + if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); + goto failed_mount_wq; + } + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index b3cd7655a6ff..eb1efad0e20a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -242,6 +242,9 @@ EXT4_ATTR_FEATURE(encryption); #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); #endif +#ifdef CONFIG_FS_VERITY +EXT4_ATTR_FEATURE(verity); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { @@ -253,6 +256,9 @@ static struct attribute *ext4_feat_attrs[] = { #endif #ifdef CONFIG_UNICODE ATTR_LIST(casefold), +#endif +#ifdef CONFIG_FS_VERITY + ATTR_LIST(verity), #endif ATTR_LIST(metadata_csum_seed), NULL, diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c new file mode 100644 index 000000000000..d0d8a9795dd6 --- /dev/null +++ b/fs/ext4/verity.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ext4/verity.c: fs-verity support for ext4 + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for ext4. + * + * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past + * the end of the file, starting at the first 64K boundary beyond i_size. This + * approach works because (a) verity files are readonly, and (b) pages fully + * beyond i_size aren't visible to userspace but can be read/written internally + * by ext4 with only some relatively small changes to ext4. 
This approach + * avoids having to depend on the EA_INODE feature and on rearchitecturing + * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and + * to support encrypting xattrs. Note that the verity metadata *must* be + * encrypted when the file is, since it contains hashes of the plaintext data. + * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/quotaops.h> + +#include "ext4.h" +#include "ext4_extents.h" +#include "ext4_jbd2.h" + +static inline loff_t ext4_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *addr; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + addr = kmap_atomic(page); + memcpy(buf, addr + offset_in_page(pos), n); + kunmap_atomic(addr); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly.
+ */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + if (pos + count > inode->i_sb->s_maxbytes) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + void *fsdata; + void *addr; + int res; + + res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, + &page, &fsdata); + if (res) + return res; + + addr = kmap_atomic(page); + memcpy(addr + offset_in_page(pos), buf, n); + kunmap_atomic(addr); + + res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, + page, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +static int ext4_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_add() */ + handle_t *handle; + int err; + + if (ext4_verity_in_progress(inode)) + return -EBUSY; + + /* + * Since the file was opened readonly, we have to initialize the jbd + * inode and quotas here and not rely on ->open() doing it. This must + * be done before evicting the inline data. + */ + + err = ext4_inode_attach_jinode(inode); + if (err) + return err; + + err = dquot_initialize(inode); + if (err) + return err; + + err = ext4_convert_inline_data(inode); + if (err) + return err; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_warning_inode(inode, + "verity is only allowed on extent-based files"); + return -EOPNOTSUPP; + } + + /* + * ext4 uses the last allocated block to find the verity descriptor, so + * we must remove any other blocks past EOF which might confuse things. 
+ */ + err = ext4_truncate(inode); + if (err) + return err; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err == 0) + ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + ext4_journal_stop(handle); + return err; +} + +/* + * ext4 stores the verity descriptor beginning on the next filesystem block + * boundary after the Merkle tree. Then, the descriptor size is stored in the + * last 4 bytes of the last allocated filesystem block --- which is either the + * block in which the descriptor ends, or the next block after that if there + * weren't at least 4 bytes remaining. + * + * We can't simply store the descriptor in an xattr because it *must* be + * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt + * xattrs. Also, if the descriptor includes a large signature blob it may be + * too large to store in an xattr without the EA_INODE feature. + */ +static int ext4_write_verity_descriptor(struct inode *inode, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) + + merkle_tree_size, i_blocksize(inode)); + const u64 desc_end = desc_pos + desc_size; + const __le32 desc_size_disk = cpu_to_le32(desc_size); + const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk), + i_blocksize(inode)) - + sizeof(desc_size_disk); + int err; + + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + return err; + + return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); +} + +static int ext4_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + const int credits = 2; /* superblock and inode for ext4_orphan_del() */ + handle_t *handle; + int err = 0; + int err2; + + if (desc != NULL) { + /* Succeeded; write the verity 
descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + + /* Write all pages before clearing VERITY_IN_PROGRESS. */ + if (!err) + err = filemap_write_and_wait(inode->i_mapping); + } + + /* If we failed, truncate anything we wrote past i_size. */ + if (desc == NULL || err) + ext4_truncate(inode); + + /* + * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and + * deleting the inode from the orphan list, even if something failed. + * If everything succeeded, we'll also set the verity bit in the same + * transaction. + */ + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + err2 = ext4_orphan_del(handle, inode); + if (err2) + goto out_stop; + + if (desc != NULL && !err) { + struct ext4_iloc iloc; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } +out_stop: + ext4_journal_stop(handle); + return err ?: err2; +} + +static int ext4_get_verity_descriptor_location(struct inode *inode, + size_t *desc_size_ret, + u64 *desc_pos_ret) +{ + struct ext4_ext_path *path; + struct ext4_extent *last_extent; + u32 end_lblk; + u64 desc_size_pos; + __le32 desc_size_disk; + u32 desc_size; + u64 desc_pos; + int err; + + /* + * Descriptor size is in last 4 bytes of last allocated block. + * See ext4_write_verity_descriptor(). 
+ */ + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + EXT4_ERROR_INODE(inode, "verity file doesn't use extents"); + return -EFSCORRUPTED; + } + + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + last_extent = path[path->p_depth].p_ext; + if (!last_extent) { + EXT4_ERROR_INODE(inode, "verity file has no extents"); + ext4_ext_drop_refs(path); + kfree(path); + return -EFSCORRUPTED; + } + + end_lblk = le32_to_cpu(last_extent->ee_block) + + ext4_ext_get_actual_len(last_extent); + desc_size_pos = (u64)end_lblk << inode->i_blkbits; + ext4_ext_drop_refs(path); + kfree(path); + + if (desc_size_pos < sizeof(desc_size_disk)) + goto bad; + desc_size_pos -= sizeof(desc_size_disk); + + err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk), + desc_size_pos); + if (err) + return err; + desc_size = le32_to_cpu(desc_size_disk); + + /* + * The descriptor is stored just before the desc_size_disk, but starting + * on a filesystem block boundary. 
+ */ + + if (desc_size > INT_MAX || desc_size > desc_size_pos) + goto bad; + + desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode)); + if (desc_pos < ext4_verity_metadata_pos(inode)) + goto bad; + + *desc_size_ret = desc_size; + *desc_pos_ret = desc_pos; + return 0; + +bad: + EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor"); + return -EFSCORRUPTED; +} + +static int ext4_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + size_t desc_size = 0; + u64 desc_pos = 0; + int err; + + err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos); + if (err) + return err; + + if (buf_size) { + if (desc_size > buf_size) + return -ERANGE; + err = pagecache_read(inode, buf, desc_size, desc_pos); + if (err) + return err; + } + return desc_size; +} + +static struct page *ext4_read_merkle_tree_page(struct inode *inode, + pgoff_t index) +{ + index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; + + return read_mapping_page(inode->i_mapping, index, NULL); +} + +static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize); + + return pagecache_write(inode, buf, 1 << log_blocksize, pos); +} + +const struct fsverity_operations ext4_verityops = { + .begin_enable_verity = ext4_begin_enable_verity, + .end_enable_verity = ext4_end_enable_verity, + .get_verity_descriptor = ext4_get_verity_descriptor, + .read_merkle_tree_page = ext4_read_merkle_tree_page, + .write_merkle_tree_block = ext4_write_merkle_tree_block, +}; -- cgit From 22cfe4b48ccb5a3dbb92d6dcb88f396e0f400f74 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 Jul 2019 09:26:24 -0700 Subject: ext4: add fs-verity read support Make ext4_mpage_readpages() verify data as it is read from fs-verity files, using the helper functions from fs/verity/. 
To support both encryption and verity simultaneously, this required refactoring the decryption workflow into a generic "post-read processing" workflow which can do decryption, verification, or both. The case where the ext4 block size is not equal to the PAGE_SIZE is not supported yet, since in that case ext4_mpage_readpages() sometimes falls back to block_read_full_page(), which does not support fs-verity yet. Co-developed-by: Theodore Ts'o Signed-off-by: Theodore Ts'o Signed-off-by: Eric Biggers --- fs/ext4/ext4.h | 2 + fs/ext4/inode.c | 2 + fs/ext4/readpage.c | 211 ++++++++++++++++++++++++++++++++++++++++++++--------- fs/ext4/super.c | 9 ++- 4 files changed, 188 insertions(+), 36 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 736972f46ea6..9c7f4036021b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3191,6 +3191,8 @@ static inline void ext4_set_de_type(struct super_block *sb, extern int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead); +extern int __init ext4_init_post_read_processing(void); +extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6de3d4ba28f3..cf0fce1173a4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3912,6 +3912,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return 0; #endif + if (fsverity_active(inode)) + return 0; /* * If we are doing data journalling we don't support O_DIRECT diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c916017db334..a30b203fa461 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -47,13 +47,103 @@ #include "ext4.h" -static inline bool ext4_bio_encrypted(struct bio *bio) +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache 
*bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, + STEP_VERITY, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { -#ifdef CONFIG_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + struct page *page; + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_status || PageError(page)) { + ClearPageUptodate(page); + /* will re-read again later */ + ClearPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void verity_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fsverity_verify_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + /* + * We use different work queues for decryption and for verity because + * verity may require reading metadata pages that need decryption, and + * we shouldn't recurse to the same workqueue. 
+ */ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + case STEP_VERITY: + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_status; } /* @@ -70,30 +160,53 @@ static inline bool ext4_bio_encrypted(struct bio *bio) */ static void mpage_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + if (bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; - if (ext4_bio_encrypted(bio)) { - if (bio->bi_status) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); - return; - } + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; + __read_end_io(bio); +} - if (!bio->bi_status) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); +static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, + struct bio *bio, + pgoff_t first_idx) +{ + unsigned int post_read_steps = 0; + struct bio_post_read_ctx *ctx = NULL; + + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) + post_read_steps |= 1 << STEP_DECRYPT; + + if (ext4_need_verity(inode, first_idx)) + post_read_steps |= 1 << STEP_VERITY; + + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) 
+ return ERR_PTR(-ENOMEM); + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; } + return ctx; +} - bio_put(bio); +static inline loff_t ext4_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && + (IS_VERITY(inode) || ext4_verity_in_progress(inode))) + return inode->i_sb->s_maxbytes; + + return i_size_read(inode); } int ext4_mpage_readpages(struct address_space *mapping, @@ -141,7 +254,8 @@ int ext4_mpage_readpages(struct address_space *mapping, block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = (ext4_readpage_limit(inode) + + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; @@ -218,6 +332,9 @@ int ext4_mpage_readpages(struct address_space *mapping, zero_user_segment(page, first_hole << blkbits, PAGE_SIZE); if (first_hole == 0) { + if (ext4_need_verity(inode, page->index) && + !fsverity_verify_page(page)) + goto set_error_page; SetPageUptodate(page); unlock_page(page); goto next_page; @@ -241,18 +358,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct fscrypt_ctx *ctx = NULL; + struct bio_post_read_ctx *ctx; - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(GFP_NOFS); - if (IS_ERR(ctx)) - goto set_error_page; - } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + if (!bio) + goto set_error_page; + ctx = get_bio_post_read_ctx(inode, bio, page->index); + if (IS_ERR(ctx)) { + bio_put(bio); + bio = NULL; goto set_error_page; } bio_set_dev(bio, bdev); @@ -293,3 +408,29 @@ int ext4_mpage_readpages(struct address_space *mapping, submit_bio(bio); return 0; } + +int __init ext4_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = + 
kmem_cache_create("ext4_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void ext4_exit_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 05a9874687c3..23e7acd43e4e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6103,6 +6103,10 @@ static int __init ext4_init_fs(void) return err; err = ext4_init_pending(); + if (err) + goto out7; + + err = ext4_init_post_read_processing(); if (err) goto out6; @@ -6144,8 +6148,10 @@ out3: out4: ext4_exit_pageio(); out5: - ext4_exit_pending(); + ext4_exit_post_read_processing(); out6: + ext4_exit_pending(); +out7: ext4_exit_es(); return err; @@ -6162,6 +6168,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); + ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } -- cgit From e3d550c2c4f2f3dba469bc3c4b83d9332b4e99e1 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 22 Aug 2019 22:53:46 -0400 Subject: ext4: fix warning inside ext4_convert_unwritten_extents_endio Really enable warning when CONFIG_EXT4_DEBUG is set and fix missing first argument. This was introduced in commit ff95ec22cd7f ("ext4: add warning to ext4_convert_unwritten_extents_endio") and splitting extents inside endio would trigger it. 
Fixes: ff95ec22cd7f ("ext4: add warning to ext4_convert_unwritten_extents_endio") Signed-off-by: Rakesh Pandit Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0620d495fd8a..fb0f99dc8c22 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3859,8 +3859,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { -#ifdef EXT4_DEBUG - ext4_warning("Inode (%ld) finished: extent logical block %llu," +#ifdef CONFIG_EXT4_DEBUG + ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); -- cgit From 7963e5ac901251c7a3b36fe7c987623a3f309393 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Thu, 22 Aug 2019 23:00:32 -0400 Subject: ext4: treat buffers with write errors as containing valid data I got some errors when I repaired an ext4 volume which is stacked on an iSCSI target: Entry 'test60' in / (2) has deleted/unused inode 73750. Clear? It can be reproduced when the network is not good enough. When I debugged this I found that ext4 will read the entry buffer from disk while the buffer is marked with write_io_error. If the buffer is marked with write_io_error, it means it has already been written to the journal, but not yet written back (checkpointed) to disk. IOW, the journal is newer than the data on disk. If this journal records 'delete test60', it means 'test60' is still present in the on-disk metadata. In this case, if we read the buffer from disk successfully and continue to create a file, the new journal record will overwrite the journal record for 'delete test60', and the entry becomes corrupted. So, use the buffer rather than reading from disk if the buffer is marked with write_io_error.
Signed-off-by: Zhang Xiaoxu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 13 +++++++++++++ fs/ext4/inode.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17cc2dc13174..2348be3d66b7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3344,6 +3344,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) extern const struct iomap_ops ext4_iomap_ops; +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b92c7603907..9db896fc6af8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1024,7 +1024,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; - if (!bh || buffer_uptodate(bh)) + if (!bh || ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); @@ -1051,7 +1051,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. 
*/ - if (bhs[i] && !buffer_uptodate(bhs[i])) + if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bhs[i]); -- cgit From 8fcc3a580651cceb94a9f48e1914491400d5146b Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 22 Aug 2019 23:22:14 -0400 Subject: ext4: rework reserved cluster accounting when invalidating pages The goal of this patch is to remove two references to the buffer delay bit in ext4_da_page_release_reservation() as part of a larger effort to remove all such references from ext4. These two references are principally used to reduce the reserved block/cluster count when pages are invalidated as a result of truncating, punching holes, or collapsing a block range in a file. The entire function is removed and replaced with code in ext4_es_remove_extent() that reduces the reserved count as a side effect of removing a block range from delayed and not unwritten extents in the extent status tree as is done when truncating, punching holes, or collapsing ranges. The code is written to minimize the number of searches descending from rb tree roots for scalability. 
Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 + fs/ext4/extents_status.c | 446 ++++++++++++++++++++++++++++++++++++----------- fs/ext4/extents_status.h | 2 - fs/ext4/inode.c | 63 +------ 4 files changed, 353 insertions(+), 161 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2348be3d66b7..0664c43cc9dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -284,6 +284,9 @@ struct ext4_io_submit { ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index a959adc59bcd..5efbb116fba0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end); + ext4_lblk_t end, int *reserved); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); @@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, NULL); if (err != 0) goto error; retry: @@ -968,8 +968,322 @@ out: return found; } +struct rsvd_count { + int ndelonly; + bool first_do_lblk_found; + ext4_lblk_t first_do_lblk; + ext4_lblk_t last_do_lblk; + struct extent_status *left_es; + bool 
partial; + ext4_lblk_t lclu; +}; + +/* + * init_rsvd - initialize reserved count data before removing block range + * in file from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @es - pointer to first extent in range + * @rc - pointer to reserved count data + * + * Assumes es is not NULL + */ +static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + + rc->ndelonly = 0; + + /* + * for bigalloc, note the first delonly block in the range has not + * been found, record the extent containing the block to the left of + * the region to be removed, if any, and note that there's no partial + * cluster to track + */ + if (sbi->s_cluster_ratio > 1) { + rc->first_do_lblk_found = false; + if (lblk > es->es_lblk) { + rc->left_es = es; + } else { + node = rb_prev(&es->rb_node); + rc->left_es = node ? rb_entry(node, + struct extent_status, + rb_node) : NULL; + } + rc->partial = false; + } +} + +/* + * count_rsvd - count the clusters containing delayed and not unwritten + * (delonly) blocks in a range within an extent and add to + * the running tally in rsvd_count + * + * @inode - file containing extent + * @lblk - first block in range + * @len - length of range in blocks + * @es - pointer to extent containing clusters to be counted + * @rc - pointer to reserved count data + * + * Tracks partial clusters found at the beginning and end of extents so + * they aren't overcounted when they span adjacent extents + */ +static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t i, end, nclu; + + if (!ext4_es_is_delonly(es)) + return; + + WARN_ON(len <= 0); + + if (sbi->s_cluster_ratio == 1) { + rc->ndelonly += (int) len; + return; + } + + /* bigalloc */ + + i = (lblk < 
es->es_lblk) ? es->es_lblk : lblk; + end = lblk + (ext4_lblk_t) len - 1; + end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; + + /* record the first block of the first delonly extent seen */ + if (rc->first_do_lblk_found == false) { + rc->first_do_lblk = i; + rc->first_do_lblk_found = true; + } + + /* update the last lblk in the region seen so far */ + rc->last_do_lblk = end; + + /* + * if we're tracking a partial cluster and the current extent + * doesn't start with it, count it and stop tracking + */ + if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { + rc->ndelonly++; + rc->partial = false; + } + + /* + * if the first cluster doesn't start on a cluster boundary but + * ends on one, count it + */ + if (EXT4_LBLK_COFF(sbi, i) != 0) { + if (end >= EXT4_LBLK_CFILL(sbi, i)) { + rc->ndelonly++; + rc->partial = false; + i = EXT4_LBLK_CFILL(sbi, i) + 1; + } + } + + /* + * if the current cluster starts on a cluster boundary, count the + * number of whole delonly clusters in the extent + */ + if ((i + sbi->s_cluster_ratio - 1) <= end) { + nclu = (end - i + 1) >> sbi->s_cluster_bits; + rc->ndelonly += nclu; + i += nclu << sbi->s_cluster_bits; + } + + /* + * start tracking a partial cluster if there's a partial at the end + * of the current extent and we're not already tracking one + */ + if (!rc->partial && i <= end) { + rc->partial = true; + rc->lclu = EXT4_B2C(sbi, i); + } +} + +/* + * __pr_tree_search - search for a pending cluster reservation + * + * @root - root of pending reservation tree + * @lclu - logical cluster to search for + * + * Returns the pending reservation for the cluster identified by @lclu + * if found. If not, returns a reservation for the next cluster if any, + * and if not, returns NULL. 
+ */ +static struct pending_reservation *__pr_tree_search(struct rb_root *root, + ext4_lblk_t lclu) +{ + struct rb_node *node = root->rb_node; + struct pending_reservation *pr = NULL; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else + return pr; + } + if (pr && lclu < pr->lclu) + return pr; + if (pr && lclu > pr->lclu) { + node = rb_next(&pr->rb_node); + return node ? rb_entry(node, struct pending_reservation, + rb_node) : NULL; + } + return NULL; +} + +/* + * get_rsvd - calculates and returns the number of cluster reservations to be + * released when removing a block range from the extent status tree + * and releases any pending reservations within the range + * + * @inode - file containing block range + * @end - last block in range + * @right_es - pointer to extent containing next block beyond end or NULL + * @rc - pointer to reserved count data + * + * The number of reservations to be released is equal to the number of + * clusters containing delayed and not unwritten (delonly) blocks within + * the range, minus the number of clusters still containing delonly blocks + * at the ends of the range, and minus the number of pending reservations + * within the range. 
+ */ +static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, + struct extent_status *right_es, + struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + bool left_delonly, right_delonly, count_pending; + struct extent_status *es; + + if (sbi->s_cluster_ratio > 1) { + /* count any remaining partial cluster */ + if (rc->partial) + rc->ndelonly++; + + if (rc->ndelonly == 0) + return 0; + + first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); + last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); + + /* + * decrease the delonly count by the number of clusters at the + * ends of the range that still contain delonly blocks - + * these clusters still need to be reserved + */ + left_delonly = right_delonly = false; + + es = rc->left_es; + while (es && ext4_es_end(es) >= + EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + left_delonly = true; + break; + } + node = rb_prev(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (end < ext4_es_end(right_es)) { + es = right_es; + } else { + node = rb_next(&right_es->rb_node); + es = node ? rb_entry(node, struct extent_status, + rb_node) : NULL; + } + while (es && es->es_lblk <= + EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + right_delonly = true; + break; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, + rb_node); + } + } + + /* + * Determine the block range that should be searched for + * pending reservations, if any. Clusters on the ends of the + * original removed range containing delonly blocks are + * excluded. 
They've already been accounted for and it's not + * possible to determine if an associated pending reservation + * should be released with the information available in the + * extents status tree. + */ + if (first_lclu == last_lclu) { + if (left_delonly | right_delonly) + count_pending = false; + else + count_pending = true; + } else { + if (left_delonly) + first_lclu++; + if (right_delonly) + last_lclu--; + if (first_lclu <= last_lclu) + count_pending = true; + else + count_pending = false; + } + + /* + * a pending reservation found between first_lclu and last_lclu + * represents an allocated cluster that contained at least one + * delonly block, so the delonly total must be reduced by one + * for each pending reservation found and released + */ + if (count_pending) { + pr = __pr_tree_search(&tree->root, first_lclu); + while (pr && pr->lclu <= last_lclu) { + rc->ndelonly--; + node = rb_next(&pr->rb_node); + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + if (!node) + break; + pr = rb_entry(node, struct pending_reservation, + rb_node); + } + } + } + return rc->ndelonly; +} + + +/* + * __es_remove_extent - removes block range from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @end - last block in range + * @reserved - number of cluster reservations released + * + * If @reserved is not NULL and delayed allocation is enabled, counts + * block/cluster reservations freed by removing range and if bigalloc + * enabled cancels pending reservations as needed. Returns 0 on success, + * error code on failure. 
+ */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end) + ext4_lblk_t end, int *reserved) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; @@ -978,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len1, len2; ext4_fsblk_t block; int err; + bool count_reserved = true; + struct rsvd_count rc; + if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) + count_reserved = false; retry: err = 0; + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; @@ -989,6 +1308,8 @@ retry: /* Simply invalidate cache_es. */ tree->cache_es = NULL; + if (count_reserved) + init_rsvd(inode, lblk, es, &rc); orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; @@ -1030,10 +1351,16 @@ retry: ext4_es_store_pblock(es, block); } } + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, + &orig_es, &rc); goto out; } if (len1 > 0) { + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1, + &orig_es, &rc); node = rb_next(&es->rb_node); if (node) es = rb_entry(node, struct extent_status, rb_node); @@ -1042,6 +1369,8 @@ retry: } while (es && ext4_es_end(es) <= end) { + if (count_reserved) + count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); @@ -1056,6 +1385,9 @@ retry: ext4_lblk_t orig_len = es->es_len; len1 = ext4_es_end(es) - end; + if (count_reserved) + count_rsvd(inode, es->es_lblk, orig_len - len1, + es, &rc); es->es_lblk = end + 1; es->es_len = len1; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { @@ -1064,20 +1396,28 @@ retry: } } + if (count_reserved) + *reserved = get_rsvd(inode, end, es, &rc); out: return err; } /* - * ext4_es_remove_extent() removes a space from a extent status tree. + * ext4_es_remove_extent - removes block range from extent status tree * - * Return 0 on success, error code on failure. 
+ * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + * Reduces block/cluster reservation count and for bigalloc cancels pending + * reservations as needed. Returns 0 on success, error code on failure. */ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; + int reserved = 0; trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", @@ -1095,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, &reserved); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); + ext4_da_release_space(inode, reserved); return err; } @@ -1327,6 +1668,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; + while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; @@ -1628,7 +1970,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, lblk); + err = __es_remove_extent(inode, lblk, lblk, NULL); if (err != 0) goto error; retry: @@ -1817,93 +2159,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, __remove_pending(inode, last); } } - -/* - * ext4_es_remove_blks - remove block range from extents status tree and - * reduce reservation count or cancel pending - * reservation as needed - * - * @inode - file containing range - * @lblk - first block in range - * @len - number of blocks to remove - * - */ -void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int clu_size, reserved = 0; - 
ext4_lblk_t last_lclu, first, length, remainder, last; - bool delonly; - int err = 0; - struct pending_reservation *pr; - struct ext4_pending_tree *tree; - - /* - * Process cluster by cluster for bigalloc - there may be up to - * two clusters in a 4k page with a 1k block size and two blocks - * per cluster. Also necessary for systems with larger page sizes - * and potentially larger block sizes. - */ - clu_size = sbi->s_cluster_ratio; - last_lclu = EXT4_B2C(sbi, lblk + len - 1); - - write_lock(&EXT4_I(inode)->i_es_lock); - - for (first = lblk, remainder = len; - remainder > 0; - first += length, remainder -= length) { - - if (EXT4_B2C(sbi, first) == last_lclu) - length = remainder; - else - length = clu_size - EXT4_LBLK_COFF(sbi, first); - - /* - * The BH_Delay flag, which triggers calls to this function, - * and the contents of the extents status tree can be - * inconsistent due to writepages activity. So, note whether - * the blocks to be removed actually belong to an extent with - * delayed only status. 
- */ - delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); - - /* - * because of the writepages effect, written and unwritten - * blocks could be removed here - */ - last = first + length - 1; - err = __es_remove_extent(inode, first, last); - if (err) - ext4_warning(inode->i_sb, - "%s: couldn't remove page (err = %d)", - __func__, err); - - /* non-bigalloc case: simply count the cluster for release */ - if (sbi->s_cluster_ratio == 1 && delonly) { - reserved++; - continue; - } - - /* - * bigalloc case: if all delayed allocated only blocks have - * just been removed from a cluster, either cancel a pending - * reservation if it exists or count a cluster for release - */ - if (delonly && - !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { - pr = __get_pending(inode, EXT4_B2C(sbi, first)); - if (pr != NULL) { - tree = &EXT4_I(inode)->i_pending_tree; - rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); - } else { - reserved++; - } - } - } - - write_unlock(&EXT4_I(inode)->i_es_lock); - - ext4_da_release_space(inode, reserved); -} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index eb56a1289031..5e5c4a40d863 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -247,8 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9db896fc6af8..2b1c58da8d1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1646,49 +1646,6 @@ void ext4_da_release_space(struct inode *inode, int to_free) dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } -static void ext4_da_page_release_reservation(struct page *page, - unsigned 
int offset, - unsigned int length) -{ - int contiguous_blks = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - struct inode *inode = page->mapping->host; - unsigned int stop = offset + length; - ext4_fsblk_t lblk; - - BUG_ON(stop > PAGE_SIZE || stop < length); - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if (next_off > stop) - break; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - contiguous_blks++; - clear_buffer_delay(bh); - } else if (contiguous_blks) { - lblk = page->index << - (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - contiguous_blks = 0; - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - - if (contiguous_blks) { - lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - } - -} - /* * Delayed allocation stuff */ @@ -3227,24 +3184,6 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset, length); - -out: - ext4_invalidatepage(page, offset, length); - - return; -} - /* * Force all delayed allocation blocks to be allocated for a given inode. 
*/ @@ -3985,7 +3924,7 @@ static const struct address_space_operations ext4_da_aops = { .write_end = ext4_da_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, + .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, -- cgit From c1e8220bd316d8ae8e524df39534b8a412a45d5e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 23 Aug 2019 22:38:00 -0400 Subject: ext4: fix punch hole for inline_data file systems If a program attempts to punch a hole on an inline data file, we need to convert it to a normal file first. This was detected using ext4/032 using the adv configuration. Simple reproducer: mke2fs -Fq -t ext4 -O inline_data /dev/vdc mount /vdc echo "" > /vdc/testfile xfs_io -c 'truncate 33554432' /vdc/testfile xfs_io -c 'fpunch 0 1048576' /vdc/testfile umount /vdc e2fsck -fy /dev/vdc Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2b1c58da8d1e..e567f0229d4e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4236,6 +4236,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + if (ext4_has_inline_data(inode)) { + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_convert_inline_data(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); + if (ret) + return ret; + } + /* * Write out all dirty pages to avoid race conditions * Then release them. 
-- cgit From 7727ae52975d4f4ef7ff69ed8e6e25f6a4168158 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 28 Aug 2019 11:13:24 -0400 Subject: ext4: fix potential use after free after remounting with noblock_validity Remount process will release system zone which was allocated before if "noblock_validity" is specified. If we mount an ext4 file system to two mountpoints with default mount options, and then remount one of them with "noblock_validity", it may trigger a use after free problem when someone is accessing the other one. # mount /dev/sda foo # mount /dev/sda bar User access mountpoint "foo" | Remount mountpoint "bar" | ext4_map_blocks() | ext4_remount() check_block_validity() | ext4_setup_system_zone() ext4_data_block_valid() | ext4_release_system_zone() | free system_blks rb nodes access system_blks rb nodes | trigger use after free | This problem can also be reproduced with a single mountpoint. At the same time, add_system_zone() can get called during remount as well, so there can be a racing ext4_data_block_valid() reading the rbtree at the same time. This patch adds RCU to protect the system zone from being released or built when doing a remount which inverts the current "noblock_validity" mount option. It assigns the rbtree pointer only after the whole tree is complete and does the actual freeing after an RCU grace period, so readers never see an intermediate state.
Reported-by: syzbot+1e470567330b7ad711d5@syzkaller.appspotmail.com Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/block_validity.c | 189 ++++++++++++++++++++++++++++++++++------------- fs/ext4/ext4.h | 10 ++- 2 files changed, 147 insertions(+), 52 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8e83741b02e0..d4d4fdfac1a6 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void) void ext4_exit_system_zone(void) { + rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } @@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1, return 0; } +static void release_system_zone(struct ext4_system_blocks *system_blks) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &system_blks->root, node) + kmem_cache_free(ext4_system_zone_cachep, entry); +} + /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ -static int add_system_zone(struct ext4_sb_info *sbi, +static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_system_zone *new_entry = NULL, *entry; - struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { @@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, new_node = &new_entry->node; rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &sbi->system_blks); + rb_insert_color(new_node, &system_blks->root); } /* Can we merge to the left? 
*/ @@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks); + node = rb_first(&sbi->system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); } -static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. 
+ */ +static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, + struct ext4_system_blocks *system_blks, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + + if (system_blks == NULL) + return 1; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + +static int ext4_protect_reserved_inode(struct super_block *sb, + struct ext4_system_blocks *system_blks, + u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) if (n == 0) { i++; } else { - if (!ext4_data_block_valid(sbi, map.m_pblk, n)) { + if (!ext4_data_block_valid_rcu(sbi, system_blks, + map.m_pblk, n)) { ext4_error(sb, "blocks %llu-%llu from inode %u " "overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1, ino); err = -EFSCORRUPTED; break; } - err = add_system_zone(sbi, map.m_pblk, n); + err = add_system_zone(system_blks, map.m_pblk, n); if (err < 0) break; i += n; @@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) return err; } +static void ext4_destroy_system_zone(struct rcu_head *rcu) +{ + struct ext4_system_blocks *system_blks; + + system_blks = container_of(rcu, struct ext4_system_blocks, rcu); + release_system_zone(system_blks); + kfree(system_blks); +} + +/* + * Build system zone rbtree which is used for block validity checking. 
+ * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. That's why we first build the rbtree and then + * swap it in place. + */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int flex_size = ext4_flex_bg_size(sbi); int ret; if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks.rb_node) + if (sbi->system_blks) ext4_release_system_zone(sb); return 0; } - if (sbi->system_blks.rb_node) + if (sbi->system_blks) return 0; + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); + if (!system_blks) + return -ENOMEM; + for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(sbi, ext4_group_first_block_no(sb, i), + add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); - ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + ret = add_system_zone(system_blks, + ext4_block_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + goto err; + ret = add_system_zone(system_blks, + ext4_inode_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + goto err; + ret = add_system_zone(system_blks, + ext4_inode_table(sb, gdp), sbi->s_itb_per_group); if (ret) - return ret; + goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { - ret = ext4_protect_reserved_inode(sb, + ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) - return ret; + goto err; } + /* + * System blks rbtree complete, 
announce it once to prevent racing + * with ext4_data_block_valid() accessing the rbtree at the same + * time. + */ + rcu_assign_pointer(sbi->system_blks, system_blks); + if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; +err: + release_system_zone(system_blks); + kfree(system_blks); + return ret; } -/* Called when the filesystem is unmounted */ +/* + * Called when the filesystem is unmounted or when remounting it with + * noblock_validity specified. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. So we first clear the system_blks pointer and + * then free the rbtree only after RCU grace period expires. + */ void ext4_release_system_zone(struct super_block *sb) { - struct ext4_system_zone *entry, *n; + struct ext4_system_blocks *system_blks; - rbtree_postorder_for_each_entry_safe(entry, n, - &EXT4_SB(sb)->system_blks, node) - kmem_cache_free(ext4_system_zone_cachep, entry); + system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks, + lockdep_is_held(&sb->s_umount)); + rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL); - EXT4_SB(sb)->system_blks = RB_ROOT; + if (system_blks) + call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } -/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. 
- */ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count) { - struct ext4_system_zone *entry; - struct rb_node *n = sbi->system_blks.rb_node; + struct ext4_system_blocks *system_blks; + int ret; - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - } - return 1; + /* + * Lock the system zone to prevent it being released concurrently + * when doing a remount which inverse current "[no]block_validity" + * mount option. + */ + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, + count); + rcu_read_unlock(); + return ret; } int ext4_check_blockref(const char *function, unsigned int line, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0664c43cc9dc..c35bb8d734df 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -184,6 +184,14 @@ struct ext4_map_blocks { unsigned int m_flags; }; +/* + * Block validity checking, system zone rbtree. 
+ */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + /* * Flags for ext4_io_end->flags */ @@ -1431,7 +1439,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; + struct ext4_system_blocks __rcu *system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ -- cgit From 520f897a3554b0665af1ae5d5ba286f290cecf5c Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Wed, 28 Aug 2019 11:19:23 -0400 Subject: ext4: use percpu_counters for extent_status cache hits/misses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit @es_stats_cache_hits and @es_stats_cache_misses are accessed frequently in the ext4_es_lookup_extent() function, which hurts ext4 read/write performance on NUMA systems. Let's optimize them using percpu_counter, which improves performance. The test command is as below: fio -name=randwrite -numjobs=8 -filename=/mnt/test1 -rw=randwrite -ioengine=libaio -direct=1 -iodepth=64 -sync=0 -norandommap -group_reporting -runtime=120 -time_based -bs=4k -size=5G The result is 10% better than the initial implementation: without the patch, IOPS=197k, BW=770MiB/s (808MB/s)(90.3GiB/120002msec) with the patch, IOPS=218k, BW=852MiB/s (894MB/s)(99.9GiB/120002msec) Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Eric Biggers Signed-off-by: Yang Guo Signed-off-by: Shaokun Zhang --- fs/ext4/extents_status.c | 37 ++++++++++++++++++++++++------------- fs/ext4/extents_status.h | 4 ++-- 2 files changed, 26 insertions(+), 15 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 5efbb116fba0..d996b44d2265 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -948,7 +948,7 @@ out: es->es_pblk = es1->es_pblk; if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); - stats->es_stats_cache_hits++; +
percpu_counter_inc(&stats->es_stats_cache_hits); if (next_lblk) { node = rb_next(&es1->rb_node); if (node) { @@ -959,7 +959,7 @@ out: *next_lblk = 0; } } else { - stats->es_stats_cache_misses++; + percpu_counter_inc(&stats->es_stats_cache_misses); } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -1586,9 +1586,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); - seq_printf(seq, " %lu/%lu cache hits/misses\n", - es_stats->es_stats_cache_hits, - es_stats->es_stats_cache_misses); + seq_printf(seq, " %lld/%lld cache hits/misses\n", + percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), + percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); if (inode_cnt) seq_printf(seq, " %d inodes on list\n", inode_cnt); @@ -1615,35 +1615,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) sbi->s_es_nr_inode = 0; spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; - sbi->s_es_stats.es_stats_cache_hits = 0; - sbi->s_es_stats.es_stats_cache_misses = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, + GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, + GFP_KERNEL); + if (err) + goto err1; sbi->s_es_stats.es_stats_scan_time = 0; sbi->s_es_stats.es_stats_max_scan_time = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) - return err; + goto err2; err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) - goto err1; + goto err3; sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; err = register_shrinker(&sbi->s_es_shrinker); if (err) - goto err2; + goto err4; return 0; - -err2: +err4: 
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); -err1: +err3: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 5e5c4a40d863..825313c59752 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -70,8 +70,8 @@ struct ext4_es_tree { struct ext4_es_stats { unsigned long es_stats_shrunk; - unsigned long es_stats_cache_hits; - unsigned long es_stats_cache_misses; + struct percpu_counter es_stats_cache_hits; + struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; -- cgit From 9ba55543fc0c6bb1cf8edd63be8802d9ab7e1202 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 28 Aug 2019 11:25:01 -0400 Subject: ext4: fix integer overflow when calculating commit interval If user specify a large enough value of "commit=" option, it may trigger signed integer overflow which may lead to sbi->s_commit_interval becomes a large or small value, zero in particular. UBSAN: Undefined behaviour in ../fs/ext4/super.c:1592:31 signed integer overflow: 536870912 * 1000 cannot be represented in type 'int' [...] Call trace: [...] 
[] ubsan_epilogue+0x34/0x9c lib/ubsan.c:166 [] handle_overflow+0x228/0x280 lib/ubsan.c:197 [] __ubsan_handle_mul_overflow+0x4c/0x68 lib/ubsan.c:218 [] handle_mount_opt fs/ext4/super.c:1592 [inline] [] parse_options+0x1724/0x1a40 fs/ext4/super.c:1773 [] ext4_remount+0x2ec/0x14a0 fs/ext4/super.c:4834 [...] Although it is not a big deal, still silence UBSAN by limiting the input value. Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..7310facffa9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1874,6 +1874,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } else if (token == Opt_commit) { if (arg == 0) arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + else if (arg > INT_MAX / HZ) { + ext4_msg(sb, KERN_ERR, + "Invalid commit interval %d, " + "must be smaller than %d", + arg, INT_MAX / HZ); + return -1; + } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { sbi->s_want_extra_isize = arg; -- cgit From 4881c4971df04107b37c29bb6c719ec29ceb6571 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 21 Jan 2018 18:04:24 -0800 Subject: ext4: Initialize timestamps limits ext4 has different overflow limits for max filesystem timestamps based on the extra bytes available.
The timestamp limits are calculated according to the encoding table in a4dad1ae24f85 (ext4: Fix handling of extended tv_sec): * extra msb of adjust for signed * epoch 32-bit 32-bit tv_sec to * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 Note that the time limits are not correct for deletion times. Added a warning when an inode cannot be extended to incorporate an extended timestamp. Signed-off-by: Deepa Dinamani Reviewed-by: Andreas Dilger Acked-by: Jeff Layton Cc: tytso@mit.edu Cc: adilger.kernel@dilger.ca Cc: linux-ext4@vger.kernel.org --- fs/ext4/ext4.h | 10 +++++++++- fs/ext4/super.c | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..9e3ae3be3de9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,11 +828,15 @@ static inline void ext4_decode_extra_time(struct timespec64 *time, #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ do { \ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ (raw_inode)->xtime ## _extra = \ ext4_encode_extra_time(&(inode)->xtime); \ } \ + else {\ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ + ext4_warning_inode(inode, "inode does not support timestamps beyond 2038"); \ + } \ }
while (0) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ @@ -1632,6 +1636,10 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_GOOD_OLD_INODE_SIZE 128 +#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) +#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX +#define EXT4_TIMESTAMP_MIN S32_MIN + /* * Feature set definitions */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4079605d437a..3ea2d60f33aa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4035,8 +4035,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inode_size); goto failed_mount; } - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) - sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + /* + * i_atime_extra is the last extra field available for [acm]times in + * struct ext4_inode. Checking for that field should suffice to ensure + * we have extra space for all three. + */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + + sb->s_time_min = EXT4_TIMESTAMP_MIN; } sbi->s_desc_size = le16_to_cpu(es->s_desc_size); -- cgit From 0642ea2409f3bfa105570e12854b8e2628db6835 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 4 Aug 2019 17:56:43 +0800 Subject: ext4 crypto: fix to check feature status before get policy When getting fscrypt policy via EXT4_IOC_GET_ENCRYPTION_POLICY, if encryption feature is off, it's better to return EOPNOTSUPP instead of ENODATA, so let's add ext4_has_feature_encrypt() to do the check for that. This makes it so that all fscrypt ioctls consistently check for the encryption feature, and makes ext4 consistent with f2fs in this regard. 
Signed-off-by: Chao Yu [EB - removed unneeded braces, updated the documentation, and added more explanation to commit message] Signed-off-by: Eric Biggers --- Documentation/filesystems/fscrypt.rst | 3 ++- fs/ext4/ioctl.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 4289c29d7c5a..8a0700af9596 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -562,7 +562,8 @@ FS_IOC_GET_ENCRYPTION_POLICY_EX can fail with the following errors: or this kernel is too old to support FS_IOC_GET_ENCRYPTION_POLICY_EX (try FS_IOC_GET_ENCRYPTION_POLICY instead) - ``EOPNOTSUPP``: the kernel was not configured with encryption - support for this filesystem + support for this filesystem, or the filesystem superblock has not + had encryption enabled on it - ``EOVERFLOW``: the file is encrypted and uses a recognized encryption policy version, but the policy struct does not fit into the provided buffer diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index fe5a4b13f939..5703d607f5af 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1113,6 +1113,8 @@ resizefs_out: #endif } case EXT4_IOC_GET_ENCRYPTION_POLICY: + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY_EX: -- cgit From 6456ca6520ab6c9aec589b4640169cd6da378c68 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Sep 2019 01:43:17 -0400 Subject: ext4: fix kernel oops caused by spurious casefold flag If a directory has the casefold flag set without the casefold feature set, s_encoding will not be initialized, and this will cause the kernel to dereference a NULL pointer.
In addition to adding checks to avoid these kernel oopses, attempts to load inodes with the casefold flag when the casefold feature is not enabled will cause the file system to be declared corrupted. Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 7 ++++--- fs/ext4/hash.c | 2 +- fs/ext4/inode.c | 3 +++ fs/ext4/namei.c | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 86054f31fe4d..9fdd2b269d61 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -668,14 +668,15 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; + struct inode *inode = dentry->d_parent->d_inode; - if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); } - return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); + return ext4_ci_compare(inode, name, &qstr, false); } static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) @@ -685,7 +686,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode)) + if (!IS_CASEFOLDED(dentry->d_inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index d358bfcb6b3f..3e133793a5a3 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir)) { + if (len && IS_CASEFOLDED(dir) && um) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e567f0229d4e..4e271b509af1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5067,6
+5067,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + ext4_error_inode(inode, function, line, 0, + "casefold flag without casefold feature"); brelse(iloc.bh); unlock_new_inode(inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 129029534075..a427d2031a8d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1312,7 +1312,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, { int len; - if (!IS_CASEFOLDED(dir)) { + if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) { cf_name->name = NULL; return; } @@ -2183,7 +2183,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #ifdef CONFIG_UNICODE if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && - utf8_validate(sbi->s_encoding, &dentry->d_name)) + sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; #endif -- cgit From cba465b4f9820b0d929822a70341dde14909fc18 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 4 Sep 2019 08:02:51 -0700 Subject: ext4: Reduce ext4 timestamp warnings When ext4 file systems were created intentionally with 128 byte inodes, the rate-limited warning of eventual possible timestamp overflow are still emitted rather frequently. Remove the warning for now. Discussion for whether any warning is needed, and where it should be emitted, can be found at https://lore.kernel.org/lkml/1567523922.5576.57.camel@lca.pw/. I can post a separate follow-up patch after the conclusion. 
Reported-by: Qian Cai Signed-off-by: Deepa Dinamani Reviewed-by: Andreas Dilger Signed-off-by: Arnd Bergmann --- fs/ext4/ext4.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9e3ae3be3de9..24b14bd3feab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -833,10 +833,8 @@ do { \ (raw_inode)->xtime ## _extra = \ ext4_encode_extra_time(&(inode)->xtime); \ } \ - else {\ + else \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ - ext4_warning_inode(inode, "inode does not support timestamps beyond 2038"); \ - } \ } while (0) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -- cgit From 72dbcf72156641fde4d8ea401e977341bfd35a05 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Sep 2019 12:32:03 -0700 Subject: Revert "ext4: make __ext4_get_inode_loc plug" This reverts commit b03755ad6f33b7b8cd7312a3596a2dbf496de6e7. This is sad, and done for all the wrong reasons. Because that commit is good, and does exactly what it says: avoids a lot of small disk requests for the inode table read-ahead. However, it turns out that it causes an entirely unrelated problem: the getrandom() system call was introduced back in 2014 by commit c6e9d6f38894 ("random: introduce getrandom(2) system call"), and people use it as a convenient source of good random numbers. But part of the current semantics for getrandom() is that it waits for the entropy pool to fill at least partially (unlike /dev/urandom). And at least ArchLinux apparently has a systemd that uses getrandom() at boot time, and the improvements in IO patterns means that existing installations suddenly start hanging, waiting for entropy that will never happen. It seems to be an unlucky combination of not _quite_ enough entropy, together with a particular systemd version and configuration. 
Lennart says that the systemd-random-seed process (which is what does this early access) is supposed to not block any other boot activity, but sadly that doesn't actually seem to be the case (possibly due to bogus dependencies on cryptsetup for encrypted swapspace). The correct fix is to fix getrandom() to not block when it's not appropriate, but that fix is going to take a lot more discussion. Do we just make it act like /dev/urandom by default, and add a new flag for "wait for entropy"? Do we add a boot-time option? Or do we just limit the amount of time it will wait for entropy? So in the meantime, we do the revert to give us time to discuss the eventual fix for the fundamental problem, at which point we can re-apply the ext4 inode table access optimization. Reported-by: Ahmed S. Darwish Cc: Ted Ts'o Cc: Willy Tarreau Cc: Alexander E. Patrakov Cc: Lennart Poettering Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 420fe3deed39..006b7a2070bf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4586,7 +4586,6 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; - struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4675,7 +4674,6 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table.
*/ - blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4706,7 +4704,6 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); - blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, -- cgit From 02f03c4206c1b2a7451d3b3546f86c9c783eac13 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Sep 2019 17:59:23 -0700 Subject: Revert "Revert "ext4: make __ext4_get_inode_loc plug"" This reverts commit 72dbcf72156641fde4d8ea401e977341bfd35a05. Instead of waiting forever for entropy that may just not happen, we now try to actively generate entropy when required, and are thus hopefully avoiding the problem that caused the nice ext4 IO pattern fix to be reverted. So revert the revert. Cc: Ahmed S. Darwish Cc: Ted Ts'o Cc: Willy Tarreau Cc: Alexander E. Patrakov Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 006b7a2070bf..420fe3deed39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4586,6 +4586,7 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; + struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4674,6 +4675,7 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ + blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4704,6 +4706,7 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, -- cgit