diff options
Diffstat (limited to 'fs/ext4/inode.c')
| -rw-r--r-- | fs/ext4/inode.c | 841 | 
1 file changed, 320 insertions, 521 deletions
| diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..ce5f21b6c2b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,  						   new_size);  } -static int __ext4_journalled_writepage(struct page *page, unsigned int len);  static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,  				  int pextents); @@ -180,33 +179,6 @@ void ext4_evict_inode(struct inode *inode)  	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)  		ext4_evict_ea_inode(inode);  	if (inode->i_nlink) { -		/* -		 * When journalling data dirty buffers are tracked only in the -		 * journal. So although mm thinks everything is clean and -		 * ready for reaping the inode might still have some pages to -		 * write in the running transaction or waiting to be -		 * checkpointed. Thus calling jbd2_journal_invalidate_folio() -		 * (via truncate_inode_pages()) to discard these buffers can -		 * cause data loss. Also even if we did not discard these -		 * buffers, we would have no way to find them after the inode -		 * is reaped and thus user could see stale data if he tries to -		 * read them before the transaction is checkpointed. So be -		 * careful and force everything to disk here... We use -		 * ei->i_datasync_tid to store the newest transaction -		 * containing inode's data. -		 * -		 * Note that directories do not have this problem because they -		 * don't use page cache. 
-		 */ -		if (inode->i_ino != EXT4_JOURNAL_INO && -		    ext4_should_journal_data(inode) && -		    S_ISREG(inode->i_mode) && inode->i_data.nrpages) { -			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; -			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - -			jbd2_complete_transaction(journal, commit_tid); -			filemap_write_and_wait(&inode->i_data); -		}  		truncate_inode_pages_final(&inode->i_data);  		goto no_delete; @@ -1005,29 +977,17 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,  }  /* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction.  We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write().  So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage().  In that case, we - * *know* that ext4_writepage() has generated enough buffer credits to do the - * whole page.  So we won't block on the journal in that case, which is good, - * because the caller may be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes.  If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated.  We'll still have enough credits for the tiny quotafile - * write. + * Helper for handling dirtying of journalled data. We also mark the folio as + * dirty so that writeback code knows about this page (and inode) contains + * dirty data. 
ext4_writepages() then commits appropriate transaction to + * make data stable.   */ +static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) +{ +	folio_mark_dirty(bh->b_folio); +	return ext4_handle_dirty_metadata(handle, NULL, bh); +} +  int do_journal_get_write_access(handle_t *handle, struct inode *inode,  				struct buffer_head *bh)  { @@ -1050,17 +1010,17 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode,  	ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,  					    EXT4_JTR_NONE);  	if (!ret && dirty) -		ret = ext4_handle_dirty_metadata(handle, NULL, bh); +		ret = ext4_dirty_journalled_data(handle, bh);  	return ret;  }  #ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, +static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,  				  get_block_t *get_block)  {  	unsigned from = pos & (PAGE_SIZE - 1);  	unsigned to = from + len; -	struct inode *inode = page->mapping->host; +	struct inode *inode = folio->mapping->host;  	unsigned block_start, block_end;  	sector_t block;  	int err = 0; @@ -1070,22 +1030,24 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,  	int nr_wait = 0;  	int i; -	BUG_ON(!PageLocked(page)); +	BUG_ON(!folio_test_locked(folio));  	BUG_ON(from > PAGE_SIZE);  	BUG_ON(to > PAGE_SIZE);  	BUG_ON(from > to); -	if (!page_has_buffers(page)) -		create_empty_buffers(page, blocksize, 0); -	head = page_buffers(page); +	head = folio_buffers(folio); +	if (!head) { +		create_empty_buffers(&folio->page, blocksize, 0); +		head = folio_buffers(folio); +	}  	bbits = ilog2(blocksize); -	block = (sector_t)page->index << (PAGE_SHIFT - bbits); +	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);  	for (bh = head, block_start = 0; bh != head || !block_start;  	    block++, block_start = block_end, bh = bh->b_this_page) {  		block_end = block_start + blocksize;  		if (block_end <= from || 
block_start >= to) { -			if (PageUptodate(page)) { +			if (folio_test_uptodate(folio)) {  				set_buffer_uptodate(bh);  			}  			continue; @@ -1098,19 +1060,20 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,  			if (err)  				break;  			if (buffer_new(bh)) { -				if (PageUptodate(page)) { +				if (folio_test_uptodate(folio)) {  					clear_buffer_new(bh);  					set_buffer_uptodate(bh);  					mark_buffer_dirty(bh);  					continue;  				}  				if (block_end > to || block_start < from) -					zero_user_segments(page, to, block_end, -							   block_start, from); +					folio_zero_segments(folio, to, +							    block_end, +							    block_start, from);  				continue;  			}  		} -		if (PageUptodate(page)) { +		if (folio_test_uptodate(folio)) {  			set_buffer_uptodate(bh);  			continue;  		} @@ -1130,14 +1093,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,  			err = -EIO;  	}  	if (unlikely(err)) { -		page_zero_new_buffers(page, from, to); +		page_zero_new_buffers(&folio->page, from, to);  	} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {  		for (i = 0; i < nr_wait; i++) {  			int err2; -			err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), -								blocksize, -								bh_offset(wait[i])); +			err2 = fscrypt_decrypt_pagecache_blocks(folio, +						blocksize, bh_offset(wait[i]));  			if (err2) {  				clear_buffer_uptodate(wait[i]);  				err = err2; @@ -1149,6 +1111,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,  }  #endif +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction.  We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end().  So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. 
+ */  static int ext4_write_begin(struct file *file, struct address_space *mapping,  			    loff_t pos, unsigned len,  			    struct page **pagep, void **fsdata) @@ -1157,7 +1126,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,  	int ret, needed_blocks;  	handle_t *handle;  	int retries = 0; -	struct page *page; +	struct folio *folio;  	pgoff_t index;  	unsigned from, to; @@ -1184,68 +1153,68 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,  	}  	/* -	 * grab_cache_page_write_begin() can take a long time if the -	 * system is thrashing due to memory pressure, or if the page +	 * __filemap_get_folio() can take a long time if the +	 * system is thrashing due to memory pressure, or if the folio  	 * is being written back.  So grab it first before we start  	 * the transaction handle.  This also allows us to allocate -	 * the page (if needed) without using GFP_NOFS. +	 * the folio (if needed) without using GFP_NOFS.  	 */  retry_grab: -	page = grab_cache_page_write_begin(mapping, index); -	if (!page) -		return -ENOMEM; +	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, +					mapping_gfp_mask(mapping)); +	if (IS_ERR(folio)) +		return PTR_ERR(folio);  	/*  	 * The same as page allocation, we prealloc buffer heads before  	 * starting the handle.  	 
*/ -	if (!page_has_buffers(page)) -		create_empty_buffers(page, inode->i_sb->s_blocksize, 0); +	if (!folio_buffers(folio)) +		create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); -	unlock_page(page); +	folio_unlock(folio);  retry_journal:  	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);  	if (IS_ERR(handle)) { -		put_page(page); +		folio_put(folio);  		return PTR_ERR(handle);  	} -	lock_page(page); -	if (page->mapping != mapping) { -		/* The page got truncated from under us */ -		unlock_page(page); -		put_page(page); +	folio_lock(folio); +	if (folio->mapping != mapping) { +		/* The folio got truncated from under us */ +		folio_unlock(folio); +		folio_put(folio);  		ext4_journal_stop(handle);  		goto retry_grab;  	} -	/* In case writeback began while the page was unlocked */ -	wait_for_stable_page(page); +	/* In case writeback began while the folio was unlocked */ +	folio_wait_stable(folio);  #ifdef CONFIG_FS_ENCRYPTION  	if (ext4_should_dioread_nolock(inode)) -		ret = ext4_block_write_begin(page, pos, len, +		ret = ext4_block_write_begin(folio, pos, len,  					     ext4_get_block_unwritten);  	else -		ret = ext4_block_write_begin(page, pos, len, -					     ext4_get_block); +		ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);  #else  	if (ext4_should_dioread_nolock(inode)) -		ret = __block_write_begin(page, pos, len, +		ret = __block_write_begin(&folio->page, pos, len,  					  ext4_get_block_unwritten);  	else -		ret = __block_write_begin(page, pos, len, ext4_get_block); +		ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);  #endif  	if (!ret && ext4_should_journal_data(inode)) {  		ret = ext4_walk_page_buffers(handle, inode, -					     page_buffers(page), from, to, NULL, -					     do_journal_get_write_access); +					     folio_buffers(folio), from, to, +					     NULL, do_journal_get_write_access);  	}  	if (ret) {  		bool extended = (pos + len > inode->i_size) &&  				
!ext4_verity_in_progress(inode); -		unlock_page(page); +		folio_unlock(folio);  		/*  		 * __block_write_begin may have instantiated a few blocks  		 * outside i_size.  Trim these off again. Don't need @@ -1273,10 +1242,10 @@ retry_journal:  		if (ret == -ENOSPC &&  		    ext4_should_retry_alloc(inode->i_sb, &retries))  			goto retry_journal; -		put_page(page); +		folio_put(folio);  		return ret;  	} -	*pagep = page; +	*pagep = &folio->page;  	return ret;  } @@ -1288,7 +1257,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode,  	if (!buffer_mapped(bh) || buffer_freed(bh))  		return 0;  	set_buffer_uptodate(bh); -	ret = ext4_handle_dirty_metadata(handle, NULL, bh); +	ret = ext4_dirty_journalled_data(handle, bh);  	clear_buffer_meta(bh);  	clear_buffer_prio(bh);  	return ret; @@ -1306,6 +1275,7 @@ static int ext4_write_end(struct file *file,  			  loff_t pos, unsigned len, unsigned copied,  			  struct page *page, void *fsdata)  { +	struct folio *folio = page_folio(page);  	handle_t *handle = ext4_journal_current_handle();  	struct inode *inode = mapping->host;  	loff_t old_size = inode->i_size; @@ -1321,7 +1291,7 @@ static int ext4_write_end(struct file *file,  	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);  	/* -	 * it's important to update i_size while still holding page lock: +	 * it's important to update i_size while still holding folio lock:  	 * page writeout could otherwise come in and zero beyond i_size.  	 *  	 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree @@ -1329,15 +1299,15 @@ static int ext4_write_end(struct file *file,  	 */  	if (!verity)  		i_size_changed = ext4_update_inode_size(inode, pos + copied); -	unlock_page(page); -	put_page(page); +	folio_unlock(folio); +	folio_put(folio);  	if (old_size < pos && !verity)  		pagecache_isize_extended(inode, old_size, pos);  	/* -	 * Don't mark the inode dirty under page lock. 
First, it unnecessarily -	 * makes the holding time of page lock longer. Second, it forces lock -	 * ordering of page lock and transaction start for journaling +	 * Don't mark the inode dirty under folio lock. First, it unnecessarily +	 * makes the holding time of folio lock longer. Second, it forces lock +	 * ordering of folio lock and transaction start for journaling  	 * filesystems.  	 */  	if (i_size_changed) @@ -1371,28 +1341,28 @@ static int ext4_write_end(struct file *file,  /*   * This is a private version of page_zero_new_buffers() which doesn't   * set the buffer to be dirty, since in data=journalled mode we need - * to call ext4_handle_dirty_metadata() instead. + * to call ext4_dirty_journalled_data() instead.   */  static void ext4_journalled_zero_new_buffers(handle_t *handle,  					    struct inode *inode, -					    struct page *page, +					    struct folio *folio,  					    unsigned from, unsigned to)  {  	unsigned int block_start = 0, block_end;  	struct buffer_head *head, *bh; -	bh = head = page_buffers(page); +	bh = head = folio_buffers(folio);  	do {  		block_end = block_start + bh->b_size;  		if (buffer_new(bh)) {  			if (block_end > from && block_start < to) { -				if (!PageUptodate(page)) { +				if (!folio_test_uptodate(folio)) {  					unsigned start, size;  					start = max(from, block_start);  					size = min(to, block_end) - start; -					zero_user(page, start, size); +					folio_zero_range(folio, start, size);  					write_end_fn(handle, inode, bh);  				}  				clear_buffer_new(bh); @@ -1408,6 +1378,7 @@ static int ext4_journalled_write_end(struct file *file,  				     loff_t pos, unsigned len, unsigned copied,  				     struct page *page, void *fsdata)  { +	struct folio *folio = page_folio(page);  	handle_t *handle = ext4_journal_current_handle();  	struct inode *inode = mapping->host;  	loff_t old_size = inode->i_size; @@ -1426,25 +1397,26 @@ static int ext4_journalled_write_end(struct file *file,  	if (ext4_has_inline_data(inode))  		
return ext4_write_inline_data_end(inode, pos, len, copied, page); -	if (unlikely(copied < len) && !PageUptodate(page)) { +	if (unlikely(copied < len) && !folio_test_uptodate(folio)) {  		copied = 0; -		ext4_journalled_zero_new_buffers(handle, inode, page, from, to); +		ext4_journalled_zero_new_buffers(handle, inode, folio, +						 from, to);  	} else {  		if (unlikely(copied < len)) -			ext4_journalled_zero_new_buffers(handle, inode, page, +			ext4_journalled_zero_new_buffers(handle, inode, folio,  							 from + copied, to); -		ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), +		ret = ext4_walk_page_buffers(handle, inode, +					     folio_buffers(folio),  					     from, from + copied, &partial,  					     write_end_fn);  		if (!partial) -			SetPageUptodate(page); +			folio_mark_uptodate(folio);  	}  	if (!verity)  		size_changed = ext4_update_inode_size(inode, pos + copied); -	ext4_set_inode_state(inode, EXT4_STATE_JDATA);  	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; -	unlock_page(page); -	put_page(page); +	folio_unlock(folio); +	folio_put(folio);  	if (old_size < pos && !verity)  		pagecache_isize_extended(inode, old_size, pos); @@ -1568,6 +1540,7 @@ struct mpage_da_data {  	struct ext4_io_submit io_submit;	/* IO submission data */  	unsigned int do_map:1;  	unsigned int scanned_until_end:1; +	unsigned int journalled_more_data:1;  };  static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -1649,12 +1622,6 @@ static void ext4_print_free_blocks(struct inode *inode)  	return;  } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, -				      struct buffer_head *bh) -{ -	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} -  /*   * ext4_insert_delayed_block - adds a delayed block to the extents status   *                             tree, incrementing the reserved cluster/block @@ -1887,249 +1854,41 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,  	
return 0;  } -static int __ext4_journalled_writepage(struct page *page, -				       unsigned int len) +static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)  { -	struct address_space *mapping = page->mapping; -	struct inode *inode = mapping->host; -	handle_t *handle = NULL; -	int ret = 0, err = 0; -	int inline_data = ext4_has_inline_data(inode); -	struct buffer_head *inode_bh = NULL; -	loff_t size; - -	ClearPageChecked(page); - -	if (inline_data) { -		BUG_ON(page->index != 0); -		BUG_ON(len > ext4_get_max_inline_size(inode)); -		inode_bh = ext4_journalled_write_inline_data(inode, len, page); -		if (inode_bh == NULL) -			goto out; -	} -	/* -	 * We need to release the page lock before we start the -	 * journal, so grab a reference so the page won't disappear -	 * out from under us. -	 */ -	get_page(page); -	unlock_page(page); - -	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, -				    ext4_writepage_trans_blocks(inode)); -	if (IS_ERR(handle)) { -		ret = PTR_ERR(handle); -		put_page(page); -		goto out_no_pagelock; -	} -	BUG_ON(!ext4_handle_valid(handle)); - -	lock_page(page); -	put_page(page); -	size = i_size_read(inode); -	if (page->mapping != mapping || page_offset(page) > size) { -		/* The page got truncated from under us */ -		ext4_journal_stop(handle); -		ret = 0; -		goto out; -	} - -	if (inline_data) { -		ret = ext4_mark_inode_dirty(handle, inode); -	} else { -		struct buffer_head *page_bufs = page_buffers(page); - -		if (page->index == size >> PAGE_SHIFT) -			len = size & ~PAGE_MASK; -		else -			len = PAGE_SIZE; - -		ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -					     NULL, do_journal_get_write_access); - -		err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, -					     NULL, write_end_fn); -	} -	if (ret == 0) -		ret = err; -	err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); -	if (ret == 0) -		ret = err; -	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; -	err = 
ext4_journal_stop(handle); -	if (!ret) -		ret = err; - -	ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: -	unlock_page(page); -out_no_pagelock: -	brelse(inode_bh); -	return ret; +	mpd->first_page += folio_nr_pages(folio); +	folio_unlock(folio);  } -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... - *   - ext4_writepages after taking page lock (have journal handle) - *   - journal_submit_inode_data_buffers (no journal handle) - *   - shrink_page_list via the kswapd/direct reclaim (no journal handle) - *   - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. 
- * - * We can get recursively called as show below. - * - *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - *		ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. - */ -static int ext4_writepage(struct page *page, -			  struct writeback_control *wbc) +static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)  { -	struct folio *folio = page_folio(page); -	int ret = 0; -	loff_t size; -	unsigned int len; -	struct buffer_head *page_bufs = NULL; -	struct inode *inode = page->mapping->host; -	struct ext4_io_submit io_submit; - -	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { -		folio_invalidate(folio, 0, folio_size(folio)); -		folio_unlock(folio); -		return -EIO; -	} - -	trace_ext4_writepage(page); -	size = i_size_read(inode); -	if (page->index == size >> PAGE_SHIFT && -	    !ext4_verity_in_progress(inode)) -		len = size & ~PAGE_MASK; -	else -		len = PAGE_SIZE; - -	/* Should never happen but for bugs in other kernel subsystems */ -	if (!page_has_buffers(page)) { -		ext4_warning_inode(inode, -		   "page %lu does not have buffers attached", page->index); -		ClearPageDirty(page); -		unlock_page(page); -		return 0; -	} - -	page_bufs = page_buffers(page); -	/* -	 * We cannot do block allocation or other extent handling in this -	 * function. If there are buffers needing that, we have to redirty -	 * the page. But we may reach here when we do a journal commit via -	 * journal_submit_inode_data_buffers() and in that case we must write -	 * allocated buffers to achieve data=ordered mode guarantees. -	 * -	 * Also, if there is only one buffer per page (the fs block -	 * size == the page size), if one buffer needs block -	 * allocation or needs to modify the extent tree to clear the -	 * unwritten flag, we know that the page can't be written at -	 * all, so we might as well refuse the write immediately. 
-	 * Unfortunately if the block size != page size, we can't as -	 * easily detect this case using ext4_walk_page_buffers(), but -	 * for the extremely common case, this is an optimization that -	 * skips a useless round trip through ext4_bio_write_page(). -	 */ -	if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, -				   ext4_bh_delay_or_unwritten)) { -		redirty_page_for_writepage(wbc, page); -		if ((current->flags & PF_MEMALLOC) || -		    (inode->i_sb->s_blocksize == PAGE_SIZE)) { -			/* -			 * For memory cleaning there's no point in writing only -			 * some buffers. So just bail out. Warn if we came here -			 * from direct reclaim. -			 */ -			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) -							== PF_MEMALLOC); -			unlock_page(page); -			return 0; -		} -	} - -	if (PageChecked(page) && ext4_should_journal_data(inode)) -		/* -		 * It's mmapped pagecache.  Add buffers and journal it.  There -		 * doesn't seem much point in redirtying the page here. -		 */ -		return __ext4_journalled_writepage(page, len); - -	ext4_io_submit_init(&io_submit, wbc); -	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); -	if (!io_submit.io_end) { -		redirty_page_for_writepage(wbc, page); -		unlock_page(page); -		return -ENOMEM; -	} -	ret = ext4_bio_write_page(&io_submit, page, len); -	ext4_io_submit(&io_submit); -	/* Drop io_end reference we got from init */ -	ext4_put_io_end_defer(io_submit.io_end); -	return ret; -} - -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -{ -	int len; +	size_t len;  	loff_t size;  	int err; -	BUG_ON(page->index != mpd->first_page); -	clear_page_dirty_for_io(page); +	BUG_ON(folio->index != mpd->first_page); +	folio_clear_dirty_for_io(folio);  	/*  	 * We have to be very careful here!  Nothing protects writeback path  	 * against i_size changes and the page can be writeably mapped into  	 * page tables. So an application can be growing i_size and writing -	 * data through mmap while writeback runs. 
clear_page_dirty_for_io() +	 * data through mmap while writeback runs. folio_clear_dirty_for_io()  	 * write-protects our page in page tables and the page cannot get -	 * written to again until we release page lock. So only after -	 * clear_page_dirty_for_io() we are safe to sample i_size for -	 * ext4_bio_write_page() to zero-out tail of the written page. We rely -	 * on the barrier provided by TestClearPageDirty in -	 * clear_page_dirty_for_io() to make sure i_size is really sampled only +	 * written to again until we release folio lock. So only after +	 * folio_clear_dirty_for_io() we are safe to sample i_size for +	 * ext4_bio_write_folio() to zero-out tail of the written page. We rely +	 * on the barrier provided by folio_test_clear_dirty() in +	 * folio_clear_dirty_for_io() to make sure i_size is really sampled only  	 * after page tables are updated.  	 */  	size = i_size_read(mpd->inode); -	if (page->index == size >> PAGE_SHIFT && +	len = folio_size(folio); +	if (folio_pos(folio) + len > size &&  	    !ext4_verity_in_progress(mpd->inode))  		len = size & ~PAGE_MASK; -	else -		len = PAGE_SIZE; -	err = ext4_bio_write_page(&mpd->io_submit, page, len); +	err = ext4_bio_write_folio(&mpd->io_submit, folio, len);  	if (!err)  		mpd->wbc->nr_to_write--; -	mpd->first_page++;  	return err;  } @@ -2240,9 +1999,10 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,  	} while (lblk++, (bh = bh->b_this_page) != head);  	/* So far everything mapped? Submit the page for IO. 
*/  	if (mpd->map.m_len == 0) { -		err = mpage_submit_page(mpd, head->b_page); +		err = mpage_submit_folio(mpd, head->b_folio);  		if (err < 0)  			return err; +		mpage_folio_done(mpd, head->b_folio);  	}  	if (lblk >= blocks) {  		mpd->scanned_until_end = 1; @@ -2252,21 +2012,22 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,  }  /* - * mpage_process_page - update page buffers corresponding to changed extent and - *		       may submit fully mapped page for IO - * - * @mpd		- description of extent to map, on return next extent to map - * @m_lblk	- logical block mapping. - * @m_pblk	- corresponding physical mapping. - * @map_bh	- determines on return whether this page requires any further + * mpage_process_folio - update folio buffers corresponding to changed extent + *			 and may submit fully mapped page for IO + * @mpd: description of extent to map, on return next extent to map + * @folio: Contains these buffers. + * @m_lblk: logical block mapping. + * @m_pblk: corresponding physical mapping. + * @map_bh: determines on return whether this page requires any further   *		  mapping or not. - * Scan given page buffers corresponding to changed extent and update buffer + * + * Scan given folio buffers corresponding to changed extent and update buffer   * state according to new extent state.   * We map delalloc buffers to their physical location, clear unwritten bits. - * If the given page is not fully mapped, we update @map to the next extent in - * the given page that needs mapping & return @map_bh as true. + * If the given folio is not fully mapped, we update @mpd to the next extent in + * the given folio that needs mapping & return @map_bh as true.   
*/ -static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, +static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,  			      ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,  			      bool *map_bh)  { @@ -2279,14 +2040,14 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,  	ssize_t io_end_size = 0;  	struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); -	bh = head = page_buffers(page); +	bh = head = folio_buffers(folio);  	do {  		if (lblk < mpd->map.m_lblk)  			continue;  		if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {  			/*  			 * Buffer after end of mapped extent. -			 * Find next buffer in the page to map. +			 * Find next buffer in the folio to map.  			 */  			mpd->map.m_len = 0;  			mpd->map.m_flags = 0; @@ -2359,9 +2120,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)  		if (nr == 0)  			break;  		for (i = 0; i < nr; i++) { -			struct page *page = &fbatch.folios[i]->page; +			struct folio *folio = fbatch.folios[i]; -			err = mpage_process_page(mpd, page, &lblk, &pblock, +			err = mpage_process_folio(mpd, folio, &lblk, &pblock,  						 &map_bh);  			/*  			 * If map_bh is true, means page may require further bh @@ -2371,9 +2132,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)  			if (err < 0 || map_bh)  				goto out;  			/* Page fully mapped - let IO run! 
*/ -			err = mpage_submit_page(mpd, page); +			err = mpage_submit_folio(mpd, folio);  			if (err < 0)  				goto out; +			mpage_folio_done(mpd, folio);  		}  		folio_batch_release(&fbatch);  	} @@ -2559,17 +2321,45 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)  				MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);  } -/* Return true if the page needs to be written as part of transaction commit */ -static bool ext4_page_nomap_can_writeout(struct page *page) +static int ext4_journal_page_buffers(handle_t *handle, struct page *page, +				     int len)  { -	struct buffer_head *bh, *head; +	struct buffer_head *page_bufs = page_buffers(page); +	struct inode *inode = page->mapping->host; +	int ret, err; -	bh = head = page_buffers(page); -	do { -		if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) -			return true; -	} while ((bh = bh->b_this_page) != head); -	return false; +	ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, +				     NULL, do_journal_get_write_access); +	err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, +				     NULL, write_end_fn); +	if (ret == 0) +		ret = err; +	err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); +	if (ret == 0) +		ret = err; +	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + +	return ret; +} + +static int mpage_journal_page_buffers(handle_t *handle, +				      struct mpage_da_data *mpd, +				      struct page *page) +{ +	struct inode *inode = mpd->inode; +	loff_t size = i_size_read(inode); +	int len; + +	ClearPageChecked(page); +	mpd->wbc->nr_to_write--; + +	if (page->index == size >> PAGE_SHIFT && +	    !ext4_verity_in_progress(inode)) +		len = size & ~PAGE_MASK; +	else +		len = PAGE_SIZE; + +	return ext4_journal_page_buffers(handle, page, len);  }  /* @@ -2597,7 +2387,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  	struct address_space *mapping = mpd->inode->i_mapping;  	struct folio_batch fbatch;  	unsigned int nr_folios; 
-	long left = mpd->wbc->nr_to_write;  	pgoff_t index = mpd->first_page;  	pgoff_t end = mpd->last_page;  	xa_mark_t tag; @@ -2605,14 +2394,23 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  	int blkbits = mpd->inode->i_blkbits;  	ext4_lblk_t lblk;  	struct buffer_head *head; +	handle_t *handle = NULL; +	int bpp = ext4_journal_blocks_per_page(mpd->inode);  	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)  		tag = PAGECACHE_TAG_TOWRITE;  	else  		tag = PAGECACHE_TAG_DIRTY; -	folio_batch_init(&fbatch); +  	mpd->map.m_len = 0;  	mpd->next_page = index; +	if (ext4_should_journal_data(mpd->inode)) { +		handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, +					    bpp); +		if (IS_ERR(handle)) +			return PTR_ERR(handle); +	} +	folio_batch_init(&fbatch);  	while (index <= end) {  		nr_folios = filemap_get_folios_tag(mapping, &index, end,  				tag, &fbatch); @@ -2630,13 +2428,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  			 * newly appeared dirty pages, but have not synced all  			 * of the old dirty pages.  			 */ -			if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) +			if (mpd->wbc->sync_mode == WB_SYNC_NONE && +			    mpd->wbc->nr_to_write <= +			    mpd->map.m_len >> (PAGE_SHIFT - blkbits))  				goto out;  			/* If we can't merge this page, we are done. */  			if (mpd->map.m_len > 0 && mpd->next_page != folio->index)  				goto out; +			if (handle) { +				err = ext4_journal_ensure_credits(handle, bpp, +								  0); +				if (err < 0) +					goto out; +			} +  			folio_lock(folio);  			/*  			 * If the page is no longer dirty, or its mapping no @@ -2676,18 +2483,28 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  				mpd->first_page = folio->index;  			mpd->next_page = folio->index + folio_nr_pages(folio);  			/* -			 * Writeout for transaction commit where we cannot -			 * modify metadata is simple. Just submit the page. 
+			 * Writeout when we cannot modify metadata is simple. +			 * Just submit the page. For data=journal mode we +			 * first handle writeout of the page for checkpoint and +			 * only after that handle delayed page dirtying. This +			 * makes sure current data is checkpointed to the final +			 * location before possibly journalling it again which +			 * is desirable when the page is frequently dirtied +			 * through a pin.  			 */  			if (!mpd->can_map) { -				if (ext4_page_nomap_can_writeout(&folio->page)) { -					err = mpage_submit_page(mpd, &folio->page); +				err = mpage_submit_folio(mpd, folio); +				if (err < 0) +					goto out; +				/* Pending dirtying of journalled data? */ +				if (folio_test_checked(folio)) { +					err = mpage_journal_page_buffers(handle, +						mpd, &folio->page);  					if (err < 0)  						goto out; -				} else { -					folio_unlock(folio); -					mpd->first_page += folio_nr_pages(folio); +					mpd->journalled_more_data = 1;  				} +				mpage_folio_done(mpd, folio);  			} else {  				/* Add all dirty buffers to mpd */  				lblk = ((ext4_lblk_t)folio->index) << @@ -2699,24 +2516,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)  					goto out;  				err = 0;  			} -			left -= folio_nr_pages(folio);  		}  		folio_batch_release(&fbatch);  		cond_resched();  	}  	mpd->scanned_until_end = 1; +	if (handle) +		ext4_journal_stop(handle);  	return 0;  out:  	folio_batch_release(&fbatch); +	if (handle) +		ext4_journal_stop(handle);  	return err;  } -static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, -			     void *data) -{ -	return ext4_writepage(&folio->page, wbc); -} -  static int ext4_do_writepages(struct mpage_da_data *mpd)  {  	struct writeback_control *wbc = mpd->wbc; @@ -2742,13 +2556,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)  	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))  		goto out_writepages; -	if (ext4_should_journal_data(inode)) { -		
blk_start_plug(&plug); -		ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); -		blk_finish_plug(&plug); -		goto out_writepages; -	} -  	/*  	 * If the filesystem has aborted, it is read-only, so return  	 * right away instead of dumping stack traces later on that @@ -2783,6 +2590,26 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)  		ext4_journal_stop(handle);  	} +	/* +	 * data=journal mode does not do delalloc so we just need to writeout / +	 * journal already mapped buffers. On the other hand we need to commit +	 * transaction to make data stable. We expect all the data to be +	 * already in the journal (the only exception are DMA pinned pages +	 * dirtied behind our back) so we commit transaction here and run the +	 * writeback loop to checkpoint them. The checkpointing is not actually +	 * necessary to make data persistent *but* quite a few places (extent +	 * shifting operations, fsverity, ...) depend on being able to drop +	 * pagecache pages after calling filemap_write_and_wait() and for that +	 * checkpointing needs to happen. +	 */ +	if (ext4_should_journal_data(inode)) { +		mpd->can_map = 0; +		if (wbc->sync_mode == WB_SYNC_ALL) +			ext4_fc_commit(sbi->s_journal, +				       EXT4_I(inode)->i_datasync_tid); +	} +	mpd->journalled_more_data = 0; +  	if (ext4_should_dioread_nolock(inode)) {  		/*  		 * We may need to convert up to one extent per block in @@ -2956,13 +2783,21 @@ static int ext4_writepages(struct address_space *mapping,  		.can_map = 1,  	};  	int ret; +	int alloc_ctx;  	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))  		return -EIO; -	percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); +	alloc_ctx = ext4_writepages_down_read(sb);  	ret = ext4_do_writepages(&mpd); -	percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +	/* +	 * For data=journal writeback we could have come across pages marked +	 * for delayed dirtying (PageChecked) which were just added to the +	 * running transaction. 
Try once more to get them to stable storage. +	 */ +	if (!ret && mpd.journalled_more_data) +		ret = ext4_do_writepages(&mpd); +	ext4_writepages_up_read(sb, alloc_ctx);  	return ret;  } @@ -2990,17 +2825,18 @@ static int ext4_dax_writepages(struct address_space *mapping,  	long nr_to_write = wbc->nr_to_write;  	struct inode *inode = mapping->host;  	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); +	int alloc_ctx;  	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))  		return -EIO; -	percpu_down_read(&sbi->s_writepages_rwsem); +	alloc_ctx = ext4_writepages_down_read(inode->i_sb);  	trace_ext4_writepages(inode, wbc);  	ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);  	trace_ext4_writepages_result(inode, wbc, ret,  				     nr_to_write - wbc->nr_to_write); -	percpu_up_read(&sbi->s_writepages_rwsem); +	ext4_writepages_up_read(inode->i_sb, alloc_ctx);  	return ret;  } @@ -3043,7 +2879,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,  			       struct page **pagep, void **fsdata)  {  	int ret, retries = 0; -	struct page *page; +	struct folio *folio;  	pgoff_t index;  	struct inode *inode = mapping->host; @@ -3070,22 +2906,22 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,  	}  retry: -	page = grab_cache_page_write_begin(mapping, index); -	if (!page) -		return -ENOMEM; +	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, +			mapping_gfp_mask(mapping)); +	if (IS_ERR(folio)) +		return PTR_ERR(folio); -	/* In case writeback began while the page was unlocked */ -	wait_for_stable_page(page); +	/* In case writeback began while the folio was unlocked */ +	folio_wait_stable(folio);  #ifdef CONFIG_FS_ENCRYPTION -	ret = ext4_block_write_begin(page, pos, len, -				     ext4_da_get_block_prep); +	ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);  #else -	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); +	ret = 
__block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);  #endif  	if (ret < 0) { -		unlock_page(page); -		put_page(page); +		folio_unlock(folio); +		folio_put(folio);  		/*  		 * block_write_begin may have instantiated a few blocks  		 * outside i_size.  Trim these off again. Don't need @@ -3100,7 +2936,7 @@ retry:  		return ret;  	} -	*pagep = page; +	*pagep = &folio->page;  	return ret;  } @@ -3148,6 +2984,9 @@ static int ext4_da_write_end(struct file *file,  	    ext4_has_inline_data(inode))  		return ext4_write_inline_data_end(inode, pos, len, copied, page); +	if (unlikely(copied < len) && !PageUptodate(page)) +		copied = 0; +  	start = pos & (PAGE_SIZE - 1);  	end = start + copied - 1; @@ -3159,9 +2998,8 @@ static int ext4_da_write_end(struct file *file,  	 * i_disksize since writeback will push i_disksize upto i_size  	 * eventually. If the end of the current write is > i_size and  	 * inside an allocated block (ext4_da_should_update_i_disksize() -	 * check), we need to update i_disksize here as neither -	 * ext4_writepage() nor certain ext4_writepages() paths not -	 * allocating blocks update i_disksize. +	 * check), we need to update i_disksize here as certain +	 * ext4_writepages() paths not allocating blocks update i_disksize.  	 *  	 * Note that we defer inode dirtying to generic_write_end() /  	 * ext4_da_write_inline_data_end(). 
@@ -3235,9 +3073,7 @@ int ext4_alloc_da_blocks(struct inode *inode)  static sector_t ext4_bmap(struct address_space *mapping, sector_t block)  {  	struct inode *inode = mapping->host; -	journal_t *journal;  	sector_t ret = 0; -	int err;  	inode_lock_shared(inode);  	/* @@ -3247,45 +3083,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)  		goto out;  	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && -			test_opt(inode->i_sb, DELALLOC)) { +	    (test_opt(inode->i_sb, DELALLOC) || +	     ext4_should_journal_data(inode))) {  		/* -		 * With delalloc we want to sync the file -		 * so that we can make sure we allocate -		 * blocks for file +		 * With delalloc or journalled data we want to sync the file so +		 * that we can make sure we allocate blocks for file and data +		 * is in place for the user to see it  		 */  		filemap_write_and_wait(mapping);  	} -	if (EXT4_JOURNAL(inode) && -	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { -		/* -		 * This is a REALLY heavyweight approach, but the use of -		 * bmap on dirty files is expected to be extremely rare: -		 * only if we run lilo or swapon on a freshly made file -		 * do we expect this to happen. -		 * -		 * (bmap requires CAP_SYS_RAWIO so this does not -		 * represent an unprivileged user DOS attack --- we'd be -		 * in trouble if mortal users could trigger this path at -		 * will.) -		 * -		 * NB. EXT4_STATE_JDATA is not set on files other than -		 * regular files.  If somebody wants to bmap a directory -		 * or symlink and gets confused because the buffer -		 * hasn't yet been flushed to disk, they deserve -		 * everything they get. 
-		 */ - -		ext4_clear_inode_state(inode, EXT4_STATE_JDATA); -		journal = EXT4_JOURNAL(inode); -		jbd2_journal_lock_updates(journal); -		err = jbd2_journal_flush(journal, 0); -		jbd2_journal_unlock_updates(journal); - -		if (err) -			goto out; -	} -  	ret = iomap_bmap(mapping, block, &ext4_iomap_ops);  out: @@ -3295,17 +3102,16 @@ out:  static int ext4_read_folio(struct file *file, struct folio *folio)  { -	struct page *page = &folio->page;  	int ret = -EAGAIN; -	struct inode *inode = page->mapping->host; +	struct inode *inode = folio->mapping->host; -	trace_ext4_readpage(page); +	trace_ext4_readpage(&folio->page);  	if (ext4_has_inline_data(inode)) -		ret = ext4_readpage_inline(inode, page); +		ret = ext4_readpage_inline(inode, folio);  	if (ret == -EAGAIN) -		return ext4_mpage_readpages(inode, NULL, page); +		return ext4_mpage_readpages(inode, NULL, folio);  	return ret;  } @@ -3571,7 +3377,7 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,  	 */  	flags &= ~IOMAP_WRITE;  	ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); -	WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); +	WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);  	return ret;  } @@ -3686,24 +3492,26 @@ const struct iomap_ops ext4_iomap_report_ops = {  };  /* - * Whenever the folio is being dirtied, corresponding buffers should already - * be attached to the transaction (we take care of this in ext4_page_mkwrite() - * and ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->dirty_folio is called under VFS locks and the folio - * is not necessarily locked. - * - * We cannot just dirty the folio and leave attached buffers clean, because the - * buffers' dirty state is "definitive".  We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. 
- * - * So what we do is to mark the folio "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering.  We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive".  We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode.  So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately.   
*/  static bool ext4_journalled_dirty_folio(struct address_space *mapping,  		struct folio *folio)  {  	WARN_ON_ONCE(!folio_buffers(folio)); -	folio_set_checked(folio); +	if (folio_maybe_dma_pinned(folio)) +		folio_set_checked(folio);  	return filemap_dirty_folio(mapping, folio);  } @@ -3809,23 +3617,26 @@ static int __ext4_block_zero_page_range(handle_t *handle,  	ext4_lblk_t iblock;  	struct inode *inode = mapping->host;  	struct buffer_head *bh; -	struct page *page; +	struct folio *folio;  	int err = 0; -	page = find_or_create_page(mapping, from >> PAGE_SHIFT, -				   mapping_gfp_constraint(mapping, ~__GFP_FS)); -	if (!page) -		return -ENOMEM; +	folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, +				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, +				    mapping_gfp_constraint(mapping, ~__GFP_FS)); +	if (IS_ERR(folio)) +		return PTR_ERR(folio);  	blocksize = inode->i_sb->s_blocksize;  	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); -	if (!page_has_buffers(page)) -		create_empty_buffers(page, blocksize, 0); +	bh = folio_buffers(folio); +	if (!bh) { +		create_empty_buffers(&folio->page, blocksize, 0); +		bh = folio_buffers(folio); +	}  	/* Find the buffer that contains "offset" */ -	bh = page_buffers(page);  	pos = blocksize;  	while (offset >= pos) {  		bh = bh->b_this_page; @@ -3847,7 +3658,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,  	}  	/* Ok, it's mapped. Make sure it's up-to-date */ -	if (PageUptodate(page)) +	if (folio_test_uptodate(folio))  		set_buffer_uptodate(bh);  	if (!buffer_uptodate(bh)) { @@ -3857,7 +3668,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,  		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {  			/* We expect the key to be set. 
*/  			BUG_ON(!fscrypt_has_encryption_key(inode)); -			err = fscrypt_decrypt_pagecache_blocks(page_folio(page), +			err = fscrypt_decrypt_pagecache_blocks(folio,  							       blocksize,  							       bh_offset(bh));  			if (err) { @@ -3873,11 +3684,11 @@ static int __ext4_block_zero_page_range(handle_t *handle,  		if (err)  			goto unlock;  	} -	zero_user(page, offset, length); +	folio_zero_range(folio, offset, length);  	BUFFER_TRACE(bh, "zeroed end of block");  	if (ext4_should_journal_data(inode)) { -		err = ext4_handle_dirty_metadata(handle, inode, bh); +		err = ext4_dirty_journalled_data(handle, bh);  	} else {  		err = 0;  		mark_buffer_dirty(bh); @@ -3887,8 +3698,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,  	}  unlock: -	unlock_page(page); -	put_page(page); +	folio_unlock(folio); +	folio_put(folio);  	return err;  } @@ -5385,7 +5196,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)  	 * If the folio is fully truncated, we don't need to wait for any commit  	 * (and we even should not as __ext4_journalled_invalidate_folio() may  	 * strip all buffers from the folio but keep the folio dirty which can then -	 * confuse e.g. concurrent ext4_writepage() seeing dirty folio without +	 * confuse e.g. concurrent ext4_writepages() seeing dirty folio without  	 * buffers). Also we don't need to wait for any commit if all buffers in  	 * the folio remain valid. This is most beneficial for the common case of  	 * blocksize == PAGESIZE. 
@@ -5395,7 +5206,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)  	while (1) {  		struct folio *folio = filemap_lock_folio(inode->i_mapping,  				      inode->i_size >> PAGE_SHIFT); -		if (!folio) +		if (IS_ERR(folio))  			return;  		ret = __ext4_journalled_invalidate_folio(folio, offset,  						folio_size(folio) - offset); @@ -6119,7 +5930,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)  	journal_t *journal;  	handle_t *handle;  	int err; -	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +	int alloc_ctx;  	/*  	 * We have to be very careful here: changing a data block's @@ -6157,7 +5968,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)  		}  	} -	percpu_down_write(&sbi->s_writepages_rwsem); +	alloc_ctx = ext4_writepages_down_write(inode->i_sb);  	jbd2_journal_lock_updates(journal);  	/* @@ -6174,7 +5985,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)  		err = jbd2_journal_flush(journal, 0);  		if (err < 0) {  			jbd2_journal_unlock_updates(journal); -			percpu_up_write(&sbi->s_writepages_rwsem); +			ext4_writepages_up_write(inode->i_sb, alloc_ctx);  			return err;  		}  		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -6182,7 +5993,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)  	ext4_set_aops(inode);  	jbd2_journal_unlock_updates(journal); -	percpu_up_write(&sbi->s_writepages_rwsem); +	ext4_writepages_up_write(inode->i_sb, alloc_ctx);  	if (val)  		filemap_invalidate_unlock(inode->i_mapping); @@ -6212,7 +6023,7 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,  vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)  {  	struct vm_area_struct *vma = vmf->vma; -	struct page *page = vmf->page; +	struct folio *folio = page_folio(vmf->page);  	loff_t size;  	unsigned long len;  	int err; @@ -6256,19 +6067,18 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)  		goto out_ret;  	} -	lock_page(page); +	folio_lock(folio);  	size = 
i_size_read(inode);  	/* Page got truncated from under us? */ -	if (page->mapping != mapping || page_offset(page) > size) { -		unlock_page(page); +	if (folio->mapping != mapping || folio_pos(folio) > size) { +		folio_unlock(folio);  		ret = VM_FAULT_NOPAGE;  		goto out;  	} -	if (page->index == size >> PAGE_SHIFT) -		len = size & ~PAGE_MASK; -	else -		len = PAGE_SIZE; +	len = folio_size(folio); +	if (folio_pos(folio) + len > size) +		len = size - folio_pos(folio);  	/*  	 * Return if we have all the buffers mapped. This avoids the need to do  	 * journal_start/journal_stop which can block and take a long time @@ -6276,17 +6086,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)  	 * This cannot be done for data journalling, as we have to add the  	 * inode to the transaction's list to writeprotect pages on commit.  	 */ -	if (page_has_buffers(page)) { -		if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), +	if (folio_buffers(folio)) { +		if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),  					    0, len, NULL,  					    ext4_bh_unmapped)) {  			/* Wait so that we don't change page under IO */ -			wait_for_stable_page(page); +			folio_wait_stable(folio);  			ret = VM_FAULT_LOCKED;  			goto out;  		}  	} -	unlock_page(page); +	folio_unlock(folio);  	/* OK, we need to fill the hole... */  	if (ext4_should_dioread_nolock(inode))  		get_block = ext4_get_block_unwritten; @@ -6307,36 +6117,25 @@ retry_alloc:  	if (!ext4_should_journal_data(inode)) {  		err = block_page_mkwrite(vma, vmf, get_block);  	} else { -		lock_page(page); +		folio_lock(folio);  		size = i_size_read(inode);  		/* Page got truncated from under us? 
*/ -		if (page->mapping != mapping || page_offset(page) > size) { +		if (folio->mapping != mapping || folio_pos(folio) > size) {  			ret = VM_FAULT_NOPAGE;  			goto out_error;  		} -		if (page->index == size >> PAGE_SHIFT) -			len = size & ~PAGE_MASK; -		else -			len = PAGE_SIZE; +		len = folio_size(folio); +		if (folio_pos(folio) + len > size) +			len = size - folio_pos(folio); -		err = __block_write_begin(page, 0, len, ext4_get_block); +		err = __block_write_begin(&folio->page, 0, len, ext4_get_block);  		if (!err) {  			ret = VM_FAULT_SIGBUS; -			if (ext4_walk_page_buffers(handle, inode, -					page_buffers(page), 0, len, NULL, -					do_journal_get_write_access)) +			if (ext4_journal_page_buffers(handle, &folio->page, len))  				goto out_error; -			if (ext4_walk_page_buffers(handle, inode, -					page_buffers(page), 0, len, NULL, -					write_end_fn)) -				goto out_error; -			if (ext4_jbd2_inode_add_write(handle, inode, -						      page_offset(page), len)) -				goto out_error; -			ext4_set_inode_state(inode, EXT4_STATE_JDATA);  		} else { -			unlock_page(page); +			folio_unlock(folio);  		}  	}  	ext4_journal_stop(handle); @@ -6349,7 +6148,7 @@ out:  	sb_end_pagefault(inode->i_sb);  	return ret;  out_error: -	unlock_page(page); +	folio_unlock(folio);  	ext4_journal_stop(handle);  	goto out;  } |