diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 266 |
1 files changed, 169 insertions, 97 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 537803250ca9..941c1c0d5c6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map) +{ + unsigned int status; + int retval; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(handle, inode, map, 0); + else + retval = ext4_ind_map_blocks(handle, inode, map, 0); + + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1450,9 +1479,9 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve space for a single cluster + * Reserve space for 'nr_resv' clusters */ -static int ext4_da_reserve_space(struct inode *inode) +static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1463,18 +1492,18 @@ static int ext4_da_reserve_space(struct inode *inode) * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); - if (ext4_claim_free_clusters(sbi, 1, 0)) { + if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } - ei->i_reserved_data_blocks++; - trace_ext4_da_reserve_space(inode); + ei->i_reserved_data_blocks += nr_resv; + trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1621,24 +1650,58 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * ext4_insert_delayed_block - adds a delayed block to the extents status - * tree, incrementing the reserved cluster/block - * count or making a pending reservation - * where needed + * Check whether the cluster containing lblk has been allocated or has + * delalloc reservation. + * + * Returns 0 if the cluster doesn't have either, 1 if it has delalloc + * reservation, 2 if it's already been allocated, negative error code on + * failure. + */ +static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + + /* Has delalloc reservation? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) + return 1; + + /* Already been allocated? */ + if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) + return 2; + ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); + if (ret < 0) + return ret; + if (ret > 0) + return 2; + + return 0; +} + +/* + * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents + * status tree, incrementing the reserved + * cluster/block count or making pending + * reservations where needed * * @inode - file containing the newly added block - * @lblk - logical block to be added + * @lblk - start logical block to be added + * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ -static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; - bool allocated = false; + bool lclu_allocated = false; + bool end_allocated = false; + ext4_lblk_t resv_clu; + ext4_lblk_t end = lblk + len - 1; /* - * If the cluster containing lblk is shared with a delayed, + * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's @@ -1649,81 +1712,84 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { - ret = ext4_da_reserve_space(inode); + ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ - if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { - if (!ext4_es_scan_clu(inode, - &ext4_es_is_mapped, lblk)) { - ret = ext4_clu_mapped(inode, - EXT4_B2C(sbi, lblk)); - if (ret < 0) - return ret; - if (ret == 0) { - ret = ext4_da_reserve_space(inode); - if (ret != 0) /* ENOSPC */ - return ret; - } else { - allocated = true; - } - } else { - allocated = true; + resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; + + ret = ext4_clu_alloc_state(inode, lblk); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + lclu_allocated = (ret == 2); + } + + if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { + ret = ext4_clu_alloc_state(inode, end); + if (ret < 0) + return ret; + if (ret > 0) { + resv_clu--; + end_allocated = (ret == 2); } } + + if (resv_clu) { + ret = ext4_da_reserve_space(inode, resv_clu); + if (ret != 0) /* ENOSPC */ + return ret; + } } - ext4_es_insert_delayed_block(inode, lblk, allocated); + ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, + end_allocated); return 0; } /* - * This function is grabs code from the very beginning of - * ext4_map_blocks, but assumes that the caller is from delayed write - * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. + * Looks up the requested blocks and sets the delalloc extent map. + * First try to look up for the extent entry that contains the requested + * blocks in the extent status tree without i_data_sem, then try to look + * up for the ondisk extent mapping with i_data_sem in read mode, + * finally hold i_data_sem in write mode, looks up again and add a + * delalloc extent entry if it still couldn't find any extent. Pass out + * the mapped extent through @map and return 0 on success. */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, - struct buffer_head *bh) +static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; - sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + if (ext4_es_is_hole(&es)) goto add_delayed; +found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delonly(&es)) { + map->m_flags |= EXT4_MAP_DELAYED; return 0; } - map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; - retval = es.es_len - (iblock - es.es_lblk); - if (retval > map->m_len) - retval = map->m_len; - map->m_len = retval; + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) @@ -1734,7 +1800,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif - return retval; + return 0; } /* @@ -1744,44 +1810,41 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; - else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); - if (retval < 0) { - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - up_read(&EXT4_I(inode)->i_data_sem); - return retval; - } + retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); + if (retval) + return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); - retval = ext4_insert_delayed_block(inode, map->m_lblk); + /* + * Page fault path (ext4_page_mkwrite does not take i_rwsem) + * and fallocate path (no folio lock) can race. Make sure we + * lookup the extent status tree here again while i_data_sem + * is held in write mode, before inserting a new da entry in + * the extent status tree. + */ + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + map->m_len = min_t(unsigned int, map->m_len, + es.es_len - (map->m_lblk - es.es_lblk)); + + if (!ext4_es_is_hole(&es)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto found; + } + } else if (!ext4_has_inline_data(inode)) { + retval = ext4_map_query_blocks(NULL, inode, map); + if (retval) { + up_write(&EXT4_I(inode)->i_data_sem); + return retval < 0 ? retval : 0; + } + } + + map->m_flags |= EXT4_MAP_DELAYED; + retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); - if (retval) - return retval; - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); return retval; } @@ -1801,11 +1864,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; + sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + map.m_lblk = iblock; map.m_len = 1; @@ -1814,10 +1881,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); - if (ret <= 0) + ret = ext4_da_map_blocks(inode, &map); + if (ret < 0) return ret; + if (map.m_flags & EXT4_MAP_DELAYED) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); @@ -1865,7 +1939,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; @@ -2334,7 +2408,7 @@ static int mpage_journal_page_buffers(handle_t *handle, if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } @@ -2887,9 +2961,6 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); - /* In case writeback began while the folio was unlocked */ - folio_wait_stable(folio); - #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else @@ -2948,6 +3019,11 @@ static int ext4_da_do_write_end(struct address_space *mapping, bool disksize_changed = false; loff_t new_i_size; + if (unlikely(!folio_buffers(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. @@ -3530,7 +3606,6 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3547,7 +3622,6 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3564,7 +3638,6 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3573,7 +3646,6 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, |