Diffstat (limited to 'mm/filemap.c')
 -rw-r--r--   mm/filemap.c   1036
1 file changed, 484 insertions(+), 552 deletions(-)
| diff --git a/mm/filemap.c b/mm/filemap.c index 39c4c46c6133..2fd9b2f24025 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,99 +121,97 @@   */  static void page_cache_delete(struct address_space *mapping, -				   struct page *page, void *shadow) +				   struct folio *folio, void *shadow)  { -	XA_STATE(xas, &mapping->i_pages, page->index); -	unsigned int nr = 1; +	XA_STATE(xas, &mapping->i_pages, folio->index); +	long nr = 1;  	mapping_set_update(&xas, mapping);  	/* hugetlb pages are represented by a single entry in the xarray */ -	if (!PageHuge(page)) { -		xas_set_order(&xas, page->index, compound_order(page)); -		nr = compound_nr(page); +	if (!folio_test_hugetlb(folio)) { +		xas_set_order(&xas, folio->index, folio_order(folio)); +		nr = folio_nr_pages(folio);  	} -	VM_BUG_ON_PAGE(!PageLocked(page), page); -	VM_BUG_ON_PAGE(PageTail(page), page); -	VM_BUG_ON_PAGE(nr != 1 && shadow, page); +	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);  	xas_store(&xas, shadow);  	xas_init_marks(&xas); -	page->mapping = NULL; +	folio->mapping = NULL;  	/* Leave page->index set: truncation lookup relies upon it */  	mapping->nrpages -= nr;  } -static void unaccount_page_cache_page(struct address_space *mapping, -				      struct page *page) +static void filemap_unaccount_folio(struct address_space *mapping, +		struct folio *folio)  { -	int nr; +	long nr;  	/*  	 * if we're uptodate, flush out into the cleancache, otherwise  	 * invalidate any existing cleancache entries.  We can't leave  	 * stale data around in the cleancache once our page is gone  	 */ -	if (PageUptodate(page) && PageMappedToDisk(page)) -		cleancache_put_page(page); +	if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) +		cleancache_put_page(&folio->page);  	else -		cleancache_invalidate_page(mapping, page); +		cleancache_invalidate_page(mapping, &folio->page); -	VM_BUG_ON_PAGE(PageTail(page), page); -	VM_BUG_ON_PAGE(page_mapped(page), page); -	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { +	VM_BUG_ON_FOLIO(folio_mapped(folio), folio); +	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {  		int mapcount;  		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n", -			 current->comm, page_to_pfn(page)); -		dump_page(page, "still mapped when deleted"); +			 current->comm, folio_pfn(folio)); +		dump_page(&folio->page, "still mapped when deleted");  		dump_stack();  		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); -		mapcount = page_mapcount(page); +		mapcount = page_mapcount(&folio->page);  		if (mapping_exiting(mapping) && -		    page_count(page) >= mapcount + 2) { +		    folio_ref_count(folio) >= mapcount + 2) {  			/*  			 * All vmas have already been torn down, so it's -			 * a good bet that actually the page is unmapped, +			 * a good bet that actually the folio is unmapped,  			 * and we'd prefer not to leak it: if we're wrong,  			 * some other bad page check should catch it later.  			 */ -			page_mapcount_reset(page); -			page_ref_sub(page, mapcount); +			page_mapcount_reset(&folio->page); +			folio_ref_sub(folio, mapcount);  		}  	} -	/* hugetlb pages do not participate in page cache accounting. */ -	if (PageHuge(page)) +	/* hugetlb folios do not participate in page cache accounting. 
*/ +	if (folio_test_hugetlb(folio))  		return; -	nr = thp_nr_pages(page); +	nr = folio_nr_pages(folio); -	__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); -	if (PageSwapBacked(page)) { -		__mod_lruvec_page_state(page, NR_SHMEM, -nr); -		if (PageTransHuge(page)) -			__mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); -	} else if (PageTransHuge(page)) { -		__mod_lruvec_page_state(page, NR_FILE_THPS, -nr); +	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); +	if (folio_test_swapbacked(folio)) { +		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); +		if (folio_test_pmd_mappable(folio)) +			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); +	} else if (folio_test_pmd_mappable(folio)) { +		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);  		filemap_nr_thps_dec(mapping);  	}  	/* -	 * At this point page must be either written or cleaned by -	 * truncate.  Dirty page here signals a bug and loss of +	 * At this point folio must be either written or cleaned by +	 * truncate.  Dirty folio here signals a bug and loss of  	 * unwritten data.  	 * -	 * This fixes dirty accounting after removing the page entirely -	 * but leaves PageDirty set: it has no effect for truncated -	 * page and anyway will be cleared before returning page into +	 * This fixes dirty accounting after removing the folio entirely +	 * but leaves the dirty flag set: it has no effect for truncated +	 * folio and anyway will be cleared before returning folio to  	 * buddy allocator.  	 */ -	if (WARN_ON_ONCE(PageDirty(page))) -		account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); +	if (WARN_ON_ONCE(folio_test_dirty(folio))) +		folio_account_cleaned(folio, mapping, +					inode_to_wb(mapping->host));  }  /* @@ -221,87 +219,83 @@ static void unaccount_page_cache_page(struct address_space *mapping,   * sure the page is locked and that nobody else uses it - or that usage   * is safe.  The caller must hold the i_pages lock.   */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __filemap_remove_folio(struct folio *folio, void *shadow)  { -	struct address_space *mapping = page->mapping; +	struct address_space *mapping = folio->mapping; -	trace_mm_filemap_delete_from_page_cache(page); - -	unaccount_page_cache_page(mapping, page); -	page_cache_delete(mapping, page, shadow); +	trace_mm_filemap_delete_from_page_cache(folio); +	filemap_unaccount_folio(mapping, folio); +	page_cache_delete(mapping, folio, shadow);  } -static void page_cache_free_page(struct address_space *mapping, -				struct page *page) +void filemap_free_folio(struct address_space *mapping, struct folio *folio)  {  	void (*freepage)(struct page *);  	freepage = mapping->a_ops->freepage;  	if (freepage) -		freepage(page); +		freepage(&folio->page); -	if (PageTransHuge(page) && !PageHuge(page)) { -		page_ref_sub(page, thp_nr_pages(page)); -		VM_BUG_ON_PAGE(page_count(page) <= 0, page); +	if (folio_test_large(folio) && !folio_test_hugetlb(folio)) { +		folio_ref_sub(folio, folio_nr_pages(folio)); +		VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio);  	} else { -		put_page(page); +		folio_put(folio);  	}  }  /** - * delete_from_page_cache - delete page from page cache - * @page: the page which the kernel is trying to remove from page cache + * filemap_remove_folio - Remove folio from page cache. + * @folio: The folio.   * - * This must be called only on pages that have been verified to be in the page - * cache and locked.  It will never put the page into the free list, the caller - * has a reference on the page. 
+ * This must be called only on folios that are locked and have been + * verified to be in the page cache.  It will never put the folio into + * the free list because the caller has a reference on the page.   */ -void delete_from_page_cache(struct page *page) +void filemap_remove_folio(struct folio *folio)  { -	struct address_space *mapping = page_mapping(page); +	struct address_space *mapping = folio->mapping; -	BUG_ON(!PageLocked(page)); +	BUG_ON(!folio_test_locked(folio));  	spin_lock(&mapping->host->i_lock);  	xa_lock_irq(&mapping->i_pages); -	__delete_from_page_cache(page, NULL); +	__filemap_remove_folio(folio, NULL);  	xa_unlock_irq(&mapping->i_pages);  	if (mapping_shrinkable(mapping))  		inode_add_lru(mapping->host);  	spin_unlock(&mapping->host->i_lock); -	page_cache_free_page(mapping, page); +	filemap_free_folio(mapping, folio);  } -EXPORT_SYMBOL(delete_from_page_cache);  /* - * page_cache_delete_batch - delete several pages from page cache - * @mapping: the mapping to which pages belong - * @pvec: pagevec with pages to delete + * page_cache_delete_batch - delete several folios from page cache + * @mapping: the mapping to which folios belong + * @fbatch: batch of folios to delete   * - * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index - * and is optimised for it to be dense. - * It tolerates holes in @pvec (mapping entries at those indices are not - * modified). The function expects only THP head pages to be present in the - * @pvec. + * The function walks over mapping->i_pages and removes folios passed in + * @fbatch from the mapping. The function expects @fbatch to be sorted + * by page index and is optimised for it to be dense. + * It tolerates holes in @fbatch (mapping entries at those indices are not + * modified).   *   * The function expects the i_pages lock to be held.   */  static void page_cache_delete_batch(struct address_space *mapping, -			     struct pagevec *pvec) +			     struct folio_batch *fbatch)  { -	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); -	int total_pages = 0; +	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); +	long total_pages = 0;  	int i = 0; -	struct page *page; +	struct folio *folio;  	mapping_set_update(&xas, mapping); -	xas_for_each(&xas, page, ULONG_MAX) { -		if (i >= pagevec_count(pvec)) +	xas_for_each(&xas, folio, ULONG_MAX) { +		if (i >= folio_batch_count(fbatch))  			break;  		/* A swap/dax/shadow entry got inserted? Skip it. */ -		if (xa_is_value(page)) +		if (xa_is_value(folio))  			continue;  		/*  		 * A page got inserted in our range? Skip it. We have our @@ -310,54 +304,48 @@ static void page_cache_delete_batch(struct address_space *mapping,  		 * means our page has been removed, which shouldn't be  		 * possible because we're holding the PageLock.  		 
*/ -		if (page != pvec->pages[i]) { -			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, -					page); +		if (folio != fbatch->folios[i]) { +			VM_BUG_ON_FOLIO(folio->index > +					fbatch->folios[i]->index, folio);  			continue;  		} -		WARN_ON_ONCE(!PageLocked(page)); +		WARN_ON_ONCE(!folio_test_locked(folio)); -		if (page->index == xas.xa_index) -			page->mapping = NULL; -		/* Leave page->index set: truncation lookup relies on it */ +		folio->mapping = NULL; +		/* Leave folio->index set: truncation lookup relies on it */ -		/* -		 * Move to the next page in the vector if this is a regular -		 * page or the index is of the last sub-page of this compound -		 * page. -		 */ -		if (page->index + compound_nr(page) - 1 == xas.xa_index) -			i++; +		i++;  		xas_store(&xas, NULL); -		total_pages++; +		total_pages += folio_nr_pages(folio);  	}  	mapping->nrpages -= total_pages;  }  void delete_from_page_cache_batch(struct address_space *mapping, -				  struct pagevec *pvec) +				  struct folio_batch *fbatch)  {  	int i; -	if (!pagevec_count(pvec)) +	if (!folio_batch_count(fbatch))  		return;  	spin_lock(&mapping->host->i_lock);  	xa_lock_irq(&mapping->i_pages); -	for (i = 0; i < pagevec_count(pvec); i++) { -		trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); +	for (i = 0; i < folio_batch_count(fbatch); i++) { +		struct folio *folio = fbatch->folios[i]; -		unaccount_page_cache_page(mapping, pvec->pages[i]); +		trace_mm_filemap_delete_from_page_cache(folio); +		filemap_unaccount_folio(mapping, folio);  	} -	page_cache_delete_batch(mapping, pvec); +	page_cache_delete_batch(mapping, fbatch);  	xa_unlock_irq(&mapping->i_pages);  	if (mapping_shrinkable(mapping))  		inode_add_lru(mapping->host);  	spin_unlock(&mapping->host->i_lock); -	for (i = 0; i < pagevec_count(pvec); i++) -		page_cache_free_page(mapping, pvec->pages[i]); +	for (i = 0; i < folio_batch_count(fbatch); i++) +		filemap_free_folio(mapping, fbatch->folios[i]);  }  int filemap_check_errors(struct address_space *mapping) @@ -646,8 +634,8 @@ static bool mapping_needs_writeback(struct address_space *mapping)  	return mapping->nrpages;  } -static bool filemap_range_has_writeback(struct address_space *mapping, -					loff_t start_byte, loff_t end_byte) +bool filemap_range_has_writeback(struct address_space *mapping, +				 loff_t start_byte, loff_t end_byte)  {  	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);  	pgoff_t max = end_byte >> PAGE_SHIFT; @@ -667,34 +655,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping,  	}  	rcu_read_unlock();  	return page != NULL; - -} - -/** - * filemap_range_needs_writeback - check if range potentially needs writeback - * @mapping:           address space within which to check - * @start_byte:        offset in bytes where the range starts - * @end_byte:          offset in bytes where the range ends (inclusive) - * - * Find at least one page in the range supplied, usually used to check if - * direct writing in this range will trigger a writeback. Used by O_DIRECT - * read/write with IOCB_NOWAIT, to see if the caller needs to do - * filemap_write_and_wait_range() before proceeding. - * - * Return: %true if the caller should do filemap_write_and_wait_range() before - * doing O_DIRECT to a page in this range, %false otherwise. 
- */ -bool filemap_range_needs_writeback(struct address_space *mapping, -				   loff_t start_byte, loff_t end_byte) -{ -	if (!mapping_needs_writeback(mapping)) -		return false; -	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && -	    !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) -		return false; -	return filemap_range_has_writeback(mapping, start_byte, end_byte);  } -EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); +EXPORT_SYMBOL_GPL(filemap_range_has_writeback);  /**   * filemap_write_and_wait_range - write out & wait on a file range @@ -959,7 +921,7 @@ unlock:  		goto error;  	} -	trace_mm_filemap_add_to_page_cache(&folio->page); +	trace_mm_filemap_add_to_page_cache(folio);  	return 0;  error:  	folio->mapping = NULL; @@ -1259,10 +1221,10 @@ enum behavior {  			 * __folio_lock() waiting on then setting PG_locked.  			 */  	SHARED,		/* Hold ref to page and check the bit when woken, like -			 * wait_on_page_writeback() waiting on PG_writeback. +			 * folio_wait_writeback() waiting on PG_writeback.  			 */  	DROP,		/* Drop ref to page before wait, no check when woken, -			 * like put_and_wait_on_page_locked() on PG_locked. +			 * like folio_put_wait_locked() on PG_locked.  			 */  }; @@ -1439,22 +1401,21 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr)  EXPORT_SYMBOL(folio_wait_bit_killable);  /** - * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked - * @page: The page to wait for. + * folio_put_wait_locked - Drop a reference and wait for it to be unlocked + * @folio: The folio to wait for.   * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).   * - * The caller should hold a reference on @page.  They expect the page to + * The caller should hold a reference on @folio.  They expect the page to   * become unlocked relatively soon, but do not wish to hold up migration - * (for example) by holding the reference while waiting for the page to + * (for example) by holding the reference while waiting for the folio to   * come unlocked.  After this function returns, the caller should not - * dereference @page. + * dereference @folio.   * - * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. + * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.   */ -int put_and_wait_on_page_locked(struct page *page, int state) +int folio_put_wait_locked(struct folio *folio, int state)  { -	return folio_wait_bit_common(page_folio(page), PG_locked, state, -			DROP); +	return folio_wait_bit_common(folio, PG_locked, state, DROP);  }  /** @@ -1979,37 +1940,36 @@ no_page:  }  EXPORT_SYMBOL(__filemap_get_folio); -static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, +static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,  		xa_mark_t mark)  { -	struct page *page; +	struct folio *folio;  retry:  	if (mark == XA_PRESENT) -		page = xas_find(xas, max); +		folio = xas_find(xas, max);  	else -		page = xas_find_marked(xas, max, mark); +		folio = xas_find_marked(xas, max, mark); -	if (xas_retry(xas, page)) +	if (xas_retry(xas, folio))  		goto retry;  	/*  	 * A shadow entry of a recently evicted page, a swap  	 * entry from shmem/tmpfs or a DAX entry.  Return it  	 * without attempting to raise page count.  	 */ -	if (!page || xa_is_value(page)) -		return page; +	if (!folio || xa_is_value(folio)) +		return folio; -	if (!page_cache_get_speculative(page)) +	if (!folio_try_get_rcu(folio))  		goto reset; -	/* Has the page moved or been split? 
*/ -	if (unlikely(page != xas_reload(xas))) { -		put_page(page); +	if (unlikely(folio != xas_reload(xas))) { +		folio_put(folio);  		goto reset;  	} -	return page; +	return folio;  reset:  	xas_reset(xas);  	goto retry; @@ -2020,56 +1980,36 @@ reset:   * @mapping:	The address_space to search   * @start:	The starting page cache index   * @end:	The final page index (inclusive). - * @pvec:	Where the resulting entries are placed. + * @fbatch:	Where the resulting entries are placed.   * @indices:	The cache indices corresponding to the entries in @entries   *   * find_get_entries() will search for and return a batch of entries in - * the mapping.  The entries are placed in @pvec.  find_get_entries() - * takes a reference on any actual pages it returns. + * the mapping.  The entries are placed in @fbatch.  find_get_entries() + * takes a reference on any actual folios it returns.   * - * The search returns a group of mapping-contiguous page cache entries - * with ascending indexes.  There may be holes in the indices due to - * not-present pages. + * The entries have ascending indexes.  The indices may not be consecutive + * due to not-present entries or large folios.   * - * Any shadow entries of evicted pages, or swap entries from + * Any shadow entries of evicted folios, or swap entries from   * shmem/tmpfs, are included in the returned array.   * - * If it finds a Transparent Huge Page, head or tail, find_get_entries() - * stops at that page: the caller is likely to have a better way to handle - * the compound page as a whole, and then skip its extent, than repeatedly - * calling find_get_entries() to return all its tails. - * - * Return: the number of pages and shadow entries which were found. + * Return: The number of entries which were found.   */  unsigned find_get_entries(struct address_space *mapping, pgoff_t start, -		pgoff_t end, struct pagevec *pvec, pgoff_t *indices) +		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)  {  	XA_STATE(xas, &mapping->i_pages, start); -	struct page *page; -	unsigned int ret = 0; -	unsigned nr_entries = PAGEVEC_SIZE; +	struct folio *folio;  	rcu_read_lock(); -	while ((page = find_get_entry(&xas, end, XA_PRESENT))) { -		/* -		 * Terminate early on finding a THP, to allow the caller to -		 * handle it all at once; but continue if this is hugetlbfs. -		 */ -		if (!xa_is_value(page) && PageTransHuge(page) && -				!PageHuge(page)) { -			page = find_subpage(page, xas.xa_index); -			nr_entries = ret + 1; -		} - -		indices[ret] = xas.xa_index; -		pvec->pages[ret] = page; -		if (++ret == nr_entries) +	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { +		indices[fbatch->nr] = xas.xa_index; +		if (!folio_batch_add(fbatch, folio))  			break;  	}  	rcu_read_unlock(); -	pvec->nr = ret; -	return ret; +	return folio_batch_count(fbatch);  }  /** @@ -2077,63 +2017,64 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,   * @mapping:	The address_space to search.   * @start:	The starting page cache index.   * @end:	The final page index (inclusive). - * @pvec:	Where the resulting entries are placed. - * @indices:	The cache indices of the entries in @pvec. + * @fbatch:	Where the resulting entries are placed. + * @indices:	The cache indices of the entries in @fbatch.   *   * find_lock_entries() will return a batch of entries from @mapping. - * Swap, shadow and DAX entries are included.  Pages are returned - * locked and with an incremented refcount.  Pages which are locked by - * somebody else or under writeback are skipped.  
Only the head page of - * a THP is returned.  Pages which are partially outside the range are - * not returned. + * Swap, shadow and DAX entries are included.  Folios are returned + * locked and with an incremented refcount.  Folios which are locked + * by somebody else or under writeback are skipped.  Folios which are + * partially outside the range are not returned.   *   * The entries have ascending indexes.  The indices may not be consecutive - * due to not-present entries, THP pages, pages which could not be locked - * or pages under writeback. + * due to not-present entries, large folios, folios which could not be + * locked or folios under writeback.   *   * Return: The number of entries which were found.   */  unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, -		pgoff_t end, struct pagevec *pvec, pgoff_t *indices) +		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)  {  	XA_STATE(xas, &mapping->i_pages, start); -	struct page *page; +	struct folio *folio;  	rcu_read_lock(); -	while ((page = find_get_entry(&xas, end, XA_PRESENT))) { -		if (!xa_is_value(page)) { -			if (page->index < start) +	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { +		if (!xa_is_value(folio)) { +			if (folio->index < start)  				goto put; -			if (page->index + thp_nr_pages(page) - 1 > end) +			if (folio->index + folio_nr_pages(folio) - 1 > end)  				goto put; -			if (!trylock_page(page)) +			if (!folio_trylock(folio))  				goto put; -			if (page->mapping != mapping || PageWriteback(page)) +			if (folio->mapping != mapping || +			    folio_test_writeback(folio))  				goto unlock; -			VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), -					page); +			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), +					folio);  		} -		indices[pvec->nr] = xas.xa_index; -		if (!pagevec_add(pvec, page)) +		indices[fbatch->nr] = xas.xa_index; +		if (!folio_batch_add(fbatch, folio))  			break; -		goto next; +		continue;  unlock: -		unlock_page(page); +		folio_unlock(folio);  put: -		put_page(page); -next: -		if (!xa_is_value(page) && PageTransHuge(page)) { -			unsigned int nr_pages = thp_nr_pages(page); - -			/* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ -			xas_set(&xas, page->index + nr_pages); -			if (xas.xa_index < nr_pages) -				break; -		} +		folio_put(folio);  	}  	rcu_read_unlock(); -	return pagevec_count(pvec); +	return folio_batch_count(fbatch); +} + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ +	if (!folio_test_large(folio) || folio_test_hugetlb(folio)) +		return false; +	if (index >= max) +		return false; +	return index < folio->index + folio_nr_pages(folio) - 1;  }  /** @@ -2162,23 +2103,29 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,  			      struct page **pages)  {  	XA_STATE(xas, &mapping->i_pages, *start); -	struct page *page; +	struct folio *folio;  	unsigned ret = 0;  	if (unlikely(!nr_pages))  		return 0;  	rcu_read_lock(); -	while ((page = find_get_entry(&xas, end, XA_PRESENT))) { +	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {  		/* Skip over shadow, swap and DAX entries */ -		if (xa_is_value(page)) +		if (xa_is_value(folio))  			continue; -		pages[ret] = find_subpage(page, xas.xa_index); +again: +		pages[ret] = folio_file_page(folio, xas.xa_index);  		if (++ret == nr_pages) {  			*start = xas.xa_index + 1;  			goto out;  		} +		if (folio_more_pages(folio, xas.xa_index, end)) { +			xas.xa_index++; +			folio_ref_inc(folio); +			goto again; +		}  	}  	/* @@ -2213,36 
+2160,41 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,  			       unsigned int nr_pages, struct page **pages)  {  	XA_STATE(xas, &mapping->i_pages, index); -	struct page *page; +	struct folio *folio;  	unsigned int ret = 0;  	if (unlikely(!nr_pages))  		return 0;  	rcu_read_lock(); -	for (page = xas_load(&xas); page; page = xas_next(&xas)) { -		if (xas_retry(&xas, page)) +	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { +		if (xas_retry(&xas, folio))  			continue;  		/*  		 * If the entry has been swapped out, we can stop looking.  		 * No current caller is looking for DAX entries.  		 */ -		if (xa_is_value(page)) +		if (xa_is_value(folio))  			break; -		if (!page_cache_get_speculative(page)) +		if (!folio_try_get_rcu(folio))  			goto retry; -		/* Has the page moved or been split? */ -		if (unlikely(page != xas_reload(&xas))) +		if (unlikely(folio != xas_reload(&xas)))  			goto put_page; -		pages[ret] = find_subpage(page, xas.xa_index); +again: +		pages[ret] = folio_file_page(folio, xas.xa_index);  		if (++ret == nr_pages)  			break; +		if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { +			xas.xa_index++; +			folio_ref_inc(folio); +			goto again; +		}  		continue;  put_page: -		put_page(page); +		folio_put(folio);  retry:  		xas_reset(&xas);  	} @@ -2271,25 +2223,25 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,  			struct page **pages)  {  	XA_STATE(xas, &mapping->i_pages, *index); -	struct page *page; +	struct folio *folio;  	unsigned ret = 0;  	if (unlikely(!nr_pages))  		return 0;  	rcu_read_lock(); -	while ((page = find_get_entry(&xas, end, tag))) { +	while ((folio = find_get_entry(&xas, end, tag))) {  		/*  		 * Shadow entries should never be tagged, but this iteration  		 * is lockless so there is a window for page reclaim to evict  		 * a page we saw tagged.  Skip over it.  		 */ -		if (xa_is_value(page)) +		if (xa_is_value(folio))  			continue; -		pages[ret] = page; +		pages[ret] = &folio->page;  		if (++ret == nr_pages) { -			*index = page->index + thp_nr_pages(page); +			*index = folio->index + folio_nr_pages(folio);  			goto out;  		}  	} @@ -2332,52 +2284,50 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)  }  /* - * filemap_get_read_batch - Get a batch of pages for read + * filemap_get_read_batch - Get a batch of folios for read   * - * Get a batch of pages which represent a contiguous range of bytes - * in the file.  No tail pages will be returned.  If @index is in the - * middle of a THP, the entire THP will be returned.  The last page in - * the batch may have Readahead set or be not Uptodate so that the - * caller can take the appropriate action. + * Get a batch of folios which represent a contiguous range of bytes in + * the file.  No exceptional entries will be returned.  If @index is in + * the middle of a folio, the entire folio will be returned.  The last + * folio in the batch may have the readahead flag set or the uptodate flag + * clear so that the caller can take the appropriate action.   
*/  static void filemap_get_read_batch(struct address_space *mapping, -		pgoff_t index, pgoff_t max, struct pagevec *pvec) +		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)  {  	XA_STATE(xas, &mapping->i_pages, index); -	struct page *head; +	struct folio *folio;  	rcu_read_lock(); -	for (head = xas_load(&xas); head; head = xas_next(&xas)) { -		if (xas_retry(&xas, head)) +	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { +		if (xas_retry(&xas, folio))  			continue; -		if (xas.xa_index > max || xa_is_value(head)) +		if (xas.xa_index > max || xa_is_value(folio))  			break; -		if (!page_cache_get_speculative(head)) +		if (!folio_try_get_rcu(folio))  			goto retry; -		/* Has the page moved or been split? */ -		if (unlikely(head != xas_reload(&xas))) -			goto put_page; +		if (unlikely(folio != xas_reload(&xas))) +			goto put_folio; -		if (!pagevec_add(pvec, head)) +		if (!folio_batch_add(fbatch, folio))  			break; -		if (!PageUptodate(head)) +		if (!folio_test_uptodate(folio))  			break; -		if (PageReadahead(head)) +		if (folio_test_readahead(folio))  			break; -		xas.xa_index = head->index + thp_nr_pages(head) - 1; -		xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; +		xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);  		continue; -put_page: -		put_page(head); +put_folio: +		folio_put(folio);  retry:  		xas_reset(&xas);  	}  	rcu_read_unlock();  } -static int filemap_read_page(struct file *file, struct address_space *mapping, -		struct page *page) +static int filemap_read_folio(struct file *file, struct address_space *mapping, +		struct folio *folio)  {  	int error; @@ -2386,52 +2336,51 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,  	 * eg. multipath errors.  PG_error will be set again if readpage  	 * fails.  	 */ -	ClearPageError(page); +	folio_clear_error(folio);  	/* Start the actual read. The read will unlock the page. 
*/ -	error = mapping->a_ops->readpage(file, page); +	error = mapping->a_ops->readpage(file, &folio->page);  	if (error)  		return error; -	error = wait_on_page_locked_killable(page); +	error = folio_wait_locked_killable(folio);  	if (error)  		return error; -	if (PageUptodate(page)) +	if (folio_test_uptodate(folio))  		return 0;  	shrink_readahead_size_eio(&file->f_ra);  	return -EIO;  }  static bool filemap_range_uptodate(struct address_space *mapping, -		loff_t pos, struct iov_iter *iter, struct page *page) +		loff_t pos, struct iov_iter *iter, struct folio *folio)  {  	int count; -	if (PageUptodate(page)) +	if (folio_test_uptodate(folio))  		return true;  	/* pipes can't handle partially uptodate pages */  	if (iov_iter_is_pipe(iter))  		return false;  	if (!mapping->a_ops->is_partially_uptodate)  		return false; -	if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) +	if (mapping->host->i_blkbits >= folio_shift(folio))  		return false;  	count = iter->count; -	if (page_offset(page) > pos) { -		count -= page_offset(page) - pos; +	if (folio_pos(folio) > pos) { +		count -= folio_pos(folio) - pos;  		pos = 0;  	} else { -		pos -= page_offset(page); +		pos -= folio_pos(folio);  	} -	return mapping->a_ops->is_partially_uptodate(page, pos, count); +	return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count);  }  static int filemap_update_page(struct kiocb *iocb,  		struct address_space *mapping, struct iov_iter *iter, -		struct page *page) +		struct folio *folio)  { -	struct folio *folio = page_folio(page);  	int error;  	if (iocb->ki_flags & IOCB_NOWAIT) { @@ -2447,7 +2396,11 @@ static int filemap_update_page(struct kiocb *iocb,  			goto unlock_mapping;  		if (!(iocb->ki_flags & IOCB_WAITQ)) {  			filemap_invalidate_unlock_shared(mapping); -			put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE); +			/* +			 * This is where we usually end up waiting for a +			 * previously submitted readahead to finish. +			 */ +			folio_put_wait_locked(folio, TASK_KILLABLE);  			return AOP_TRUNCATED_PAGE;  		}  		error = __folio_lock_async(folio, iocb->ki_waitq); @@ -2460,14 +2413,14 @@ static int filemap_update_page(struct kiocb *iocb,  		goto unlock;  	error = 0; -	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page)) +	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))  		goto unlock;  	error = -EAGAIN;  	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))  		goto unlock; -	error = filemap_read_page(iocb->ki_filp, mapping, &folio->page); +	error = filemap_read_folio(iocb->ki_filp, mapping, folio);  	goto unlock_mapping;  unlock:  	folio_unlock(folio); @@ -2478,70 +2431,72 @@ unlock_mapping:  	return error;  } -static int filemap_create_page(struct file *file, +static int filemap_create_folio(struct file *file,  		struct address_space *mapping, pgoff_t index, -		struct pagevec *pvec) +		struct folio_batch *fbatch)  { -	struct page *page; +	struct folio *folio;  	int error; -	page = page_cache_alloc(mapping); -	if (!page) +	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); +	if (!folio)  		return -ENOMEM;  	/* -	 * Protect against truncate / hole punch. Grabbing invalidate_lock here -	 * assures we cannot instantiate and bring uptodate new pagecache pages -	 * after evicting page cache during truncate and before actually -	 * freeing blocks.  Note that we could release invalidate_lock after -	 * inserting the page into page cache as the locked page would then be -	 * enough to synchronize with hole punching. 
But there are code paths -	 * such as filemap_update_page() filling in partially uptodate pages or -	 * ->readpages() that need to hold invalidate_lock while mapping blocks -	 * for IO so let's hold the lock here as well to keep locking rules -	 * simple. +	 * Protect against truncate / hole punch. Grabbing invalidate_lock +	 * here assures we cannot instantiate and bring uptodate new +	 * pagecache folios after evicting page cache during truncate +	 * and before actually freeing blocks.	Note that we could +	 * release invalidate_lock after inserting the folio into +	 * the page cache as the locked folio would then be enough to +	 * synchronize with hole punching. But there are code paths +	 * such as filemap_update_page() filling in partially uptodate +	 * pages or ->readpages() that need to hold invalidate_lock +	 * while mapping blocks for IO so let's hold the lock here as +	 * well to keep locking rules simple.  	 */  	filemap_invalidate_lock_shared(mapping); -	error = add_to_page_cache_lru(page, mapping, index, +	error = filemap_add_folio(mapping, folio, index,  			mapping_gfp_constraint(mapping, GFP_KERNEL));  	if (error == -EEXIST)  		error = AOP_TRUNCATED_PAGE;  	if (error)  		goto error; -	error = filemap_read_page(file, mapping, page); +	error = filemap_read_folio(file, mapping, folio);  	if (error)  		goto error;  	filemap_invalidate_unlock_shared(mapping); -	pagevec_add(pvec, page); +	folio_batch_add(fbatch, folio);  	return 0;  error:  	filemap_invalidate_unlock_shared(mapping); -	put_page(page); +	folio_put(folio);  	return error;  }  static int filemap_readahead(struct kiocb *iocb, struct file *file, -		struct address_space *mapping, struct page *page, +		struct address_space *mapping, struct folio *folio,  		pgoff_t last_index)  { +	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); +  	if (iocb->ki_flags & IOCB_NOIO)  		return -EAGAIN; -	page_cache_async_readahead(mapping, &file->f_ra, file, page, -			page->index, last_index - page->index); +	page_cache_async_ra(&ractl, folio, last_index - folio->index);  	return 0;  }  static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, -		struct pagevec *pvec) +		struct folio_batch *fbatch)  {  	struct file *filp = iocb->ki_filp;  	struct address_space *mapping = filp->f_mapping;  	struct file_ra_state *ra = &filp->f_ra;  	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;  	pgoff_t last_index; -	struct page *page; +	struct folio *folio;  	int err = 0;  	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); @@ -2549,34 +2504,35 @@ retry:  	if (fatal_signal_pending(current))  		return -EINTR; -	filemap_get_read_batch(mapping, index, last_index, pvec); -	if (!pagevec_count(pvec)) { +	filemap_get_read_batch(mapping, index, last_index, fbatch); +	if (!folio_batch_count(fbatch)) {  		if (iocb->ki_flags & IOCB_NOIO)  			return -EAGAIN;  		page_cache_sync_readahead(mapping, ra, filp, index,  				last_index - index); -		filemap_get_read_batch(mapping, index, last_index, pvec); +		filemap_get_read_batch(mapping, index, last_index, fbatch);  	} -	if (!pagevec_count(pvec)) { +	if (!folio_batch_count(fbatch)) {  		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))  			return -EAGAIN; -		err = filemap_create_page(filp, mapping, -				iocb->ki_pos >> PAGE_SHIFT, pvec); +		err = filemap_create_folio(filp, mapping, +				iocb->ki_pos >> PAGE_SHIFT, fbatch);  		if (err == AOP_TRUNCATED_PAGE)  			goto retry;  		return err;  	} -	page = pvec->pages[pagevec_count(pvec) - 1]; -	if (PageReadahead(page)) { -		err = 
filemap_readahead(iocb, filp, mapping, page, last_index); +	folio = fbatch->folios[folio_batch_count(fbatch) - 1]; +	if (folio_test_readahead(folio)) { +		err = filemap_readahead(iocb, filp, mapping, folio, last_index);  		if (err)  			goto err;  	} -	if (!PageUptodate(page)) { -		if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) +	if (!folio_test_uptodate(folio)) { +		if ((iocb->ki_flags & IOCB_WAITQ) && +		    folio_batch_count(fbatch) > 1)  			iocb->ki_flags |= IOCB_NOWAIT; -		err = filemap_update_page(iocb, mapping, iter, page); +		err = filemap_update_page(iocb, mapping, iter, folio);  		if (err)  			goto err;  	} @@ -2584,8 +2540,8 @@ retry:  	return 0;  err:  	if (err < 0) -		put_page(page); -	if (likely(--pvec->nr)) +		folio_put(folio); +	if (likely(--fbatch->nr))  		return 0;  	if (err == AOP_TRUNCATED_PAGE)  		goto retry; @@ -2612,7 +2568,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,  	struct file_ra_state *ra = &filp->f_ra;  	struct address_space *mapping = filp->f_mapping;  	struct inode *inode = mapping->host; -	struct pagevec pvec; +	struct folio_batch fbatch;  	int i, error = 0;  	bool writably_mapped;  	loff_t isize, end_offset; @@ -2623,7 +2579,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,  		return 0;  	iov_iter_truncate(iter, inode->i_sb->s_maxbytes); -	pagevec_init(&pvec); +	folio_batch_init(&fbatch);  	do {  		cond_resched(); @@ -2639,7 +2595,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,  		if (unlikely(iocb->ki_pos >= i_size_read(inode)))  			break; -		error = filemap_get_pages(iocb, iter, &pvec); +		error = filemap_get_pages(iocb, iter, &fbatch);  		if (error < 0)  			break; @@ -2653,7 +2609,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,  		 */  		isize = i_size_read(inode);  		if (unlikely(iocb->ki_pos >= isize)) -			goto put_pages; +			goto put_folios;  		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);  		/* @@ -2668,33 +2624,29 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,  		 */  		if (iocb->ki_pos >> PAGE_SHIFT !=  		    ra->prev_pos >> PAGE_SHIFT) -			mark_page_accessed(pvec.pages[0]); +			folio_mark_accessed(fbatch.folios[0]); -		for (i = 0; i < pagevec_count(&pvec); i++) { -			struct page *page = pvec.pages[i]; -			size_t page_size = thp_size(page); -			size_t offset = iocb->ki_pos & (page_size - 1); +		for (i = 0; i < folio_batch_count(&fbatch); i++) { +			struct folio *folio = fbatch.folios[i]; +			size_t fsize = folio_size(folio); +			size_t offset = iocb->ki_pos & (fsize - 1);  			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, -					     page_size - offset); +					     fsize - offset);  			size_t copied; -			if (end_offset < page_offset(page)) +			if (end_offset < folio_pos(folio))  				break;  			if (i > 0) -				mark_page_accessed(page); +				folio_mark_accessed(folio);  			/* -			 * If users can be writing to this page using arbitrary -			 * virtual addresses, take care about potential aliasing -			 * before reading the page on the kernel side. +			 * If users can be writing to this folio using arbitrary +			 * virtual addresses, take care of potential aliasing +			 * before reading the folio on the kernel side.  			 
*/ -			if (writably_mapped) { -				int j; +			if (writably_mapped) +				flush_dcache_folio(folio); -				for (j = 0; j < thp_nr_pages(page); j++) -					flush_dcache_page(page + j); -			} - -			copied = copy_page_to_iter(page, offset, bytes, iter); +			copied = copy_folio_to_iter(folio, offset, bytes, iter);  			already_read += copied;  			iocb->ki_pos += copied; @@ -2705,10 +2657,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,  				break;  			}  		} -put_pages: -		for (i = 0; i < pagevec_count(&pvec); i++) -			put_page(pvec.pages[i]); -		pagevec_reinit(&pvec); +put_folios: +		for (i = 0; i < folio_batch_count(&fbatch); i++) +			folio_put(fbatch.folios[i]); +		folio_batch_init(&fbatch);  	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);  	file_accessed(filp); @@ -2793,44 +2745,44 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)  }  EXPORT_SYMBOL(generic_file_read_iter); -static inline loff_t page_seek_hole_data(struct xa_state *xas, -		struct address_space *mapping, struct page *page, +static inline loff_t folio_seek_hole_data(struct xa_state *xas, +		struct address_space *mapping, struct folio *folio,  		loff_t start, loff_t end, bool seek_data)  {  	const struct address_space_operations *ops = mapping->a_ops;  	size_t offset, bsz = i_blocksize(mapping->host); -	if (xa_is_value(page) || PageUptodate(page)) +	if (xa_is_value(folio) || folio_test_uptodate(folio))  		return seek_data ? start : end;  	if (!ops->is_partially_uptodate)  		return seek_data ? end : start;  	xas_pause(xas);  	rcu_read_unlock(); -	lock_page(page); -	if (unlikely(page->mapping != mapping)) +	folio_lock(folio); +	if (unlikely(folio->mapping != mapping))  		goto unlock; -	offset = offset_in_thp(page, start) & ~(bsz - 1); +	offset = offset_in_folio(folio, start) & ~(bsz - 1);  	do { -		if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) +		if (ops->is_partially_uptodate(&folio->page, offset, bsz) == +							seek_data)  			break;  		start = (start + bsz) & ~(bsz - 1);  		offset += bsz; -	} while (offset < thp_size(page)); +	} while (offset < folio_size(folio));  unlock: -	unlock_page(page); +	folio_unlock(folio);  	rcu_read_lock();  	return start;  } -static inline -unsigned int seek_page_size(struct xa_state *xas, struct page *page) +static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)  { -	if (xa_is_value(page)) +	if (xa_is_value(folio))  		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); -	return thp_size(page); +	return folio_size(folio);  }  /** @@ -2857,15 +2809,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,  	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);  	pgoff_t max = (end - 1) >> PAGE_SHIFT;  	bool seek_data = (whence == SEEK_DATA); -	struct page *page; +	struct folio *folio;  	if (end <= start)  		return -ENXIO;  	rcu_read_lock(); -	while ((page = find_get_entry(&xas, max, XA_PRESENT))) { +	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {  		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; -		unsigned int seek_size; +		size_t seek_size;  		if (start < pos) {  			if (!seek_data) @@ -2873,9 +2825,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,  			start = pos;  		} -		seek_size = seek_page_size(&xas, page); -		pos = round_up(pos + 1, seek_size); -		start = page_seek_hole_data(&xas, mapping, page, start, pos, +		seek_size = seek_folio_size(&xas, folio); +		pos = round_up((u64)pos + 1, seek_size); +		start = 
folio_seek_hole_data(&xas, mapping, folio, start, pos,  				seek_data);  		if (start < pos)  			goto unlock; @@ -2883,15 +2835,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,  			break;  		if (seek_size > PAGE_SIZE)  			xas_set(&xas, pos >> PAGE_SHIFT); -		if (!xa_is_value(page)) -			put_page(page); +		if (!xa_is_value(folio)) +			folio_put(folio);  	}  	if (seek_data)  		start = -ENXIO;  unlock:  	rcu_read_unlock(); -	if (page && !xa_is_value(page)) -		put_page(page); +	if (folio && !xa_is_value(folio)) +		folio_put(folio);  	if (start > end)  		return end;  	return start; @@ -2900,21 +2852,20 @@ unlock:  #ifdef CONFIG_MMU  #define MMAP_LOTSAMISS  (100)  /* - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock + * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock   * @vmf - the vm_fault for this fault. - * @page - the page to lock. + * @folio - the folio to lock.   * @fpin - the pointer to the file we may pin (or is already pinned).   * - * This works similar to lock_page_or_retry in that it can drop the mmap_lock. - * It differs in that it actually returns the page locked if it returns 1 and 0 - * if it couldn't lock the page.  If we did have to drop the mmap_lock then fpin - * will point to the pinned file and needs to be fput()'ed at a later point. + * This works similar to lock_folio_or_retry in that it can drop the + * mmap_lock.  It differs in that it actually returns the folio locked + * if it returns 1 and 0 if it couldn't lock the folio.  If we did have + * to drop the mmap_lock then fpin will point to the pinned file and + * needs to be fput()'ed at a later point.   */ -static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, +static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,  				     struct file **fpin)  { -	struct folio *folio = page_folio(page); -  	if (folio_trylock(folio))  		return 1; @@ -3003,25 +2954,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)   * was pinned if we have to drop the mmap_lock in order to do IO.   */  static struct file *do_async_mmap_readahead(struct vm_fault *vmf, -					    struct page *page) +					    struct folio *folio)  {  	struct file *file = vmf->vma->vm_file;  	struct file_ra_state *ra = &file->f_ra; -	struct address_space *mapping = file->f_mapping; +	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);  	struct file *fpin = NULL;  	unsigned int mmap_miss; -	pgoff_t offset = vmf->pgoff;  	/* If we don't want any read-ahead, don't bother */  	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)  		return fpin; +  	mmap_miss = READ_ONCE(ra->mmap_miss);  	if (mmap_miss)  		WRITE_ONCE(ra->mmap_miss, --mmap_miss); -	if (PageReadahead(page)) { + +	if (folio_test_readahead(folio)) {  		fpin = maybe_unlock_mmap_for_io(vmf, fpin); -		page_cache_async_readahead(mapping, ra, file, -					   page, offset, ra->ra_pages); +		page_cache_async_ra(&ractl, folio, ra->ra_pages);  	}  	return fpin;  } @@ -3040,7 +2991,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,   * vma->vm_mm->mmap_lock must be held on entry.   *   * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock - * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). + * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().   *   * If our return value does not have VM_FAULT_RETRY set, the mmap_lock   * has not been released. 
@@ -3056,28 +3007,27 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)  	struct file *fpin = NULL;  	struct address_space *mapping = file->f_mapping;  	struct inode *inode = mapping->host; -	pgoff_t offset = vmf->pgoff; -	pgoff_t max_off; -	struct page *page; +	pgoff_t max_idx, index = vmf->pgoff; +	struct folio *folio;  	vm_fault_t ret = 0;  	bool mapping_locked = false; -	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); -	if (unlikely(offset >= max_off)) +	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); +	if (unlikely(index >= max_idx))  		return VM_FAULT_SIGBUS;  	/*  	 * Do we have something in the page cache already?  	 */ -	page = find_get_page(mapping, offset); -	if (likely(page)) { +	folio = filemap_get_folio(mapping, index); +	if (likely(folio)) {  		/*  		 * We found the page, so try async readahead before waiting for  		 * the lock.  		 */  		if (!(vmf->flags & FAULT_FLAG_TRIED)) -			fpin = do_async_mmap_readahead(vmf, page); -		if (unlikely(!PageUptodate(page))) { +			fpin = do_async_mmap_readahead(vmf, folio); +		if (unlikely(!folio_test_uptodate(folio))) {  			filemap_invalidate_lock_shared(mapping);  			mapping_locked = true;  		} @@ -3089,17 +3039,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)  		fpin = do_sync_mmap_readahead(vmf);  retry_find:  		/* -		 * See comment in filemap_create_page() why we need +		 * See comment in filemap_create_folio() why we need  		 * invalidate_lock  		 */  		if (!mapping_locked) {  			filemap_invalidate_lock_shared(mapping);  			mapping_locked = true;  		} -		page = pagecache_get_page(mapping, offset, +		folio = __filemap_get_folio(mapping, index,  					  FGP_CREAT|FGP_FOR_MMAP,  					  vmf->gfp_mask); -		if (!page) { +		if (!folio) {  			if (fpin)  				goto out_retry;  			filemap_invalidate_unlock_shared(mapping); @@ -3107,22 +3057,22 @@ retry_find:  		}  	} -	if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) +	if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))  		goto out_retry;  	/* Did it get truncated? */ -	if (unlikely(compound_head(page)->mapping != mapping)) { -		unlock_page(page); -		put_page(page); +	if (unlikely(folio->mapping != mapping)) { +		folio_unlock(folio); +		folio_put(folio);  		goto retry_find;  	} -	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); +	VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);  	/*  	 * We have a locked page in the page cache, now we need to check  	 * that it's up-to-date. If not, it is going to be due to an error.  	 */ -	if (unlikely(!PageUptodate(page))) { +	if (unlikely(!folio_test_uptodate(folio))) {  		/*  		 * The page was in cache and uptodate and now it is not.  		 * Strange but possible since we didn't hold the page lock all @@ -3130,8 +3080,8 @@ retry_find:  		 * try again.  		 */  		if (!mapping_locked) { -			unlock_page(page); -			put_page(page); +			folio_unlock(folio); +			folio_put(folio);  			goto retry_find;  		}  		goto page_not_uptodate; @@ -3143,7 +3093,7 @@ retry_find:  	 * redo the fault.  	 */  	if (fpin) { -		unlock_page(page); +		folio_unlock(folio);  		goto out_retry;  	}  	if (mapping_locked) @@ -3153,14 +3103,14 @@ retry_find:  	 * Found the page and have a reference on it.  	 * We must recheck i_size under page lock.  	 
*/ -	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); -	if (unlikely(offset >= max_off)) { -		unlock_page(page); -		put_page(page); +	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); +	if (unlikely(index >= max_idx)) { +		folio_unlock(folio); +		folio_put(folio);  		return VM_FAULT_SIGBUS;  	} -	vmf->page = page; +	vmf->page = folio_file_page(folio, index);  	return ret | VM_FAULT_LOCKED;  page_not_uptodate: @@ -3171,10 +3121,10 @@ page_not_uptodate:  	 * and we need to check for errors.  	 */  	fpin = maybe_unlock_mmap_for_io(vmf, fpin); -	error = filemap_read_page(file, mapping, page); +	error = filemap_read_folio(file, mapping, folio);  	if (fpin)  		goto out_retry; -	put_page(page); +	folio_put(folio);  	if (!error || error == AOP_TRUNCATED_PAGE)  		goto retry_find; @@ -3188,8 +3138,8 @@ out_retry:  	 * re-find the vma and come back and find our hopefully still populated  	 * page.  	 */ -	if (page) -		put_page(page); +	if (folio) +		folio_put(folio);  	if (mapping_locked)  		filemap_invalidate_unlock_shared(mapping);  	if (fpin) @@ -3231,48 +3181,48 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)  	return false;  } -static struct page *next_uptodate_page(struct page *page, +static struct folio *next_uptodate_page(struct folio *folio,  				       struct address_space *mapping,  				       struct xa_state *xas, pgoff_t end_pgoff)  {  	unsigned long max_idx;  	do { -		if (!page) +		if (!folio)  			return NULL; -		if (xas_retry(xas, page)) +		if (xas_retry(xas, folio))  			continue; -		if (xa_is_value(page)) +		if (xa_is_value(folio))  			continue; -		if (PageLocked(page)) +		if (folio_test_locked(folio))  			continue; -		if (!page_cache_get_speculative(page)) +		if (!folio_try_get_rcu(folio))  			continue;  		/* Has the page moved or been split? 
*/ -		if (unlikely(page != xas_reload(xas))) +		if (unlikely(folio != xas_reload(xas)))  			goto skip; -		if (!PageUptodate(page) || PageReadahead(page)) +		if (!folio_test_uptodate(folio) || folio_test_readahead(folio))  			goto skip; -		if (!trylock_page(page)) +		if (!folio_trylock(folio))  			goto skip; -		if (page->mapping != mapping) +		if (folio->mapping != mapping)  			goto unlock; -		if (!PageUptodate(page)) +		if (!folio_test_uptodate(folio))  			goto unlock;  		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);  		if (xas->xa_index >= max_idx)  			goto unlock; -		return page; +		return folio;  unlock: -		unlock_page(page); +		folio_unlock(folio);  skip: -		put_page(page); -	} while ((page = xas_next_entry(xas, end_pgoff)) != NULL); +		folio_put(folio); +	} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);  	return NULL;  } -static inline struct page *first_map_page(struct address_space *mapping, +static inline struct folio *first_map_page(struct address_space *mapping,  					  struct xa_state *xas,  					  pgoff_t end_pgoff)  { @@ -3280,7 +3230,7 @@ static inline struct page *first_map_page(struct address_space *mapping,  				  mapping, xas, end_pgoff);  } -static inline struct page *next_map_page(struct address_space *mapping, +static inline struct folio *next_map_page(struct address_space *mapping,  					 struct xa_state *xas,  					 pgoff_t end_pgoff)  { @@ -3297,16 +3247,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,  	pgoff_t last_pgoff = start_pgoff;  	unsigned long addr;  	XA_STATE(xas, &mapping->i_pages, start_pgoff); -	struct page *head, *page; +	struct folio *folio; +	struct page *page;  	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);  	vm_fault_t ret = 0;  	rcu_read_lock(); -	head = first_map_page(mapping, &xas, end_pgoff); -	if (!head) +	folio = first_map_page(mapping, &xas, end_pgoff); +	if (!folio)  		goto out; -	if (filemap_map_pmd(vmf, head)) { +	if (filemap_map_pmd(vmf, &folio->page)) {  		ret = VM_FAULT_NOPAGE;  		goto out;  	} @@ -3314,7 +3265,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,  	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);  	do { -		page = find_subpage(head, xas.xa_index); +again: +		page = folio_file_page(folio, xas.xa_index);  		if (PageHWPoison(page))  			goto unlock; @@ -3335,12 +3287,21 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,  		do_set_pte(vmf, page, addr);  		/* no need to invalidate: a not-present page won't be cached */  		update_mmu_cache(vma, addr, vmf->pte); -		unlock_page(head); +		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { +			xas.xa_index++; +			folio_ref_inc(folio); +			goto again; +		} +		folio_unlock(folio);  		continue;  unlock: -		unlock_page(head); -		put_page(head); -	} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); +		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { +			xas.xa_index++; +			goto again; +		} +		folio_unlock(folio); +		folio_put(folio); +	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);  	pte_unmap_unlock(vmf->pte, vmf->ptl);  out:  	rcu_read_unlock(); @@ -3352,24 +3313,24 @@ EXPORT_SYMBOL(filemap_map_pages);  vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)  {  	struct address_space *mapping = vmf->vma->vm_file->f_mapping; -	struct page *page = vmf->page; +	struct folio *folio = page_folio(vmf->page);  	vm_fault_t ret = VM_FAULT_LOCKED;  	sb_start_pagefault(mapping->host->i_sb);  	
file_update_time(vmf->vma->vm_file); -	lock_page(page); -	if (page->mapping != mapping) { -		unlock_page(page); +	folio_lock(folio); +	if (folio->mapping != mapping) { +		folio_unlock(folio);  		ret = VM_FAULT_NOPAGE;  		goto out;  	}  	/* -	 * We mark the page dirty already here so that when freeze is in +	 * We mark the folio dirty already here so that when freeze is in  	 * progress, we are guaranteed that writeback during freezing will -	 * see the dirty page and writeprotect it again. +	 * see the dirty folio and writeprotect it again.  	 */ -	set_page_dirty(page); -	wait_for_stable_page(page); +	folio_mark_dirty(folio); +	folio_wait_stable(folio);  out:  	sb_end_pagefault(mapping->host->i_sb);  	return ret; @@ -3422,35 +3383,20 @@ EXPORT_SYMBOL(filemap_page_mkwrite);  EXPORT_SYMBOL(generic_file_mmap);  EXPORT_SYMBOL(generic_file_readonly_mmap); -static struct page *wait_on_page_read(struct page *page) +static struct folio *do_read_cache_folio(struct address_space *mapping, +		pgoff_t index, filler_t filler, void *data, gfp_t gfp)  { -	if (!IS_ERR(page)) { -		wait_on_page_locked(page); -		if (!PageUptodate(page)) { -			put_page(page); -			page = ERR_PTR(-EIO); -		} -	} -	return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, -				pgoff_t index, -				int (*filler)(void *, struct page *), -				void *data, -				gfp_t gfp) -{ -	struct page *page; +	struct folio *folio;  	int err;  repeat: -	page = find_get_page(mapping, index); -	if (!page) { -		page = __page_cache_alloc(gfp); -		if (!page) +	folio = filemap_get_folio(mapping, index); +	if (!folio) { +		folio = filemap_alloc_folio(gfp, 0); +		if (!folio)  			return ERR_PTR(-ENOMEM); -		err = add_to_page_cache_lru(page, mapping, index, gfp); +		err = filemap_add_folio(mapping, folio, index, gfp);  		if (unlikely(err)) { -			put_page(page); +			folio_put(folio);  			if (err == -EEXIST)  				goto repeat;  			/* Presumably ENOMEM for xarray node */ @@ -3459,71 +3405,41 @@ repeat:  filler:  		if (filler) -			err = filler(data, page); +			err = filler(data, &folio->page);  		else -			err = mapping->a_ops->readpage(data, page); +			err = mapping->a_ops->readpage(data, &folio->page);  		if (err < 0) { -			put_page(page); +			folio_put(folio);  			return ERR_PTR(err);  		} -		page = wait_on_page_read(page); -		if (IS_ERR(page)) -			return page; +		folio_wait_locked(folio); +		if (!folio_test_uptodate(folio)) { +			folio_put(folio); +			return ERR_PTR(-EIO); +		} +  		goto out;  	} -	if (PageUptodate(page)) -		goto out; - -	/* -	 * Page is not up to date and may be locked due to one of the following -	 * case a: Page is being filled and the page lock is held -	 * case b: Read/write error clearing the page uptodate status -	 * case c: Truncation in progress (page locked) -	 * case d: Reclaim in progress -	 * -	 * Case a, the page will be up to date when the page is unlocked. -	 *    There is no need to serialise on the page lock here as the page -	 *    is pinned so the lock gives no additional protection. Even if the -	 *    page is truncated, the data is still valid if PageUptodate as -	 *    it's a race vs truncate race. -	 * Case b, the page will not be up to date -	 * Case c, the page may be truncated but in itself, the data may still -	 *    be valid after IO completes as it's a read vs truncate race. 
The -	 *    operation must restart if the page is not uptodate on unlock but -	 *    otherwise serialising on page lock to stabilise the mapping gives -	 *    no additional guarantees to the caller as the page lock is -	 *    released before return. -	 * Case d, similar to truncation. If reclaim holds the page lock, it -	 *    will be a race with remove_mapping that determines if the mapping -	 *    is valid on unlock but otherwise the data is valid and there is -	 *    no need to serialise with page lock. -	 * -	 * As the page lock gives no additional guarantee, we optimistically -	 * wait on the page to be unlocked and check if it's up to date and -	 * use the page if it is. Otherwise, the page lock is required to -	 * distinguish between the different cases. The motivation is that we -	 * avoid spurious serialisations and wakeups when multiple processes -	 * wait on the same page for IO to complete. -	 */ -	wait_on_page_locked(page); -	if (PageUptodate(page)) +	if (folio_test_uptodate(folio))  		goto out; -	/* Distinguish between all the cases under the safety of the lock */ -	lock_page(page); +	if (!folio_trylock(folio)) { +		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); +		goto repeat; +	} -	/* Case c or d, restart the operation */ -	if (!page->mapping) { -		unlock_page(page); -		put_page(page); +	/* Folio was truncated from mapping */ +	if (!folio->mapping) { +		folio_unlock(folio); +		folio_put(folio);  		goto repeat;  	}  	/* Someone else locked and filled the page in a very small window */ -	if (PageUptodate(page)) { -		unlock_page(page); +	if (folio_test_uptodate(folio)) { +		folio_unlock(folio);  		goto out;  	} @@ -3533,16 +3449,16 @@ filler:  	 * Clear page error before actual read, PG_error will be  	 * set again if read page fails.  	 */ -	ClearPageError(page); +	folio_clear_error(folio);  	goto filler;  out: -	mark_page_accessed(page); -	return page; +	folio_mark_accessed(folio); +	return folio;  }  /** - * read_cache_page - read into page cache, fill it if needed + * read_cache_folio - read into page cache, fill it if needed   * @mapping:	the page's address_space   * @index:	the page index   * @filler:	function to perform the read @@ -3557,10 +3473,27 @@ out:   *   * Return: up to date page on success, ERR_PTR() on failure.   */ +struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, +		filler_t filler, void *data) +{ +	return do_read_cache_folio(mapping, index, filler, data, +			mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_folio); + +static struct page *do_read_cache_page(struct address_space *mapping, +		pgoff_t index, filler_t *filler, void *data, gfp_t gfp) +{ +	struct folio *folio; + +	folio = do_read_cache_folio(mapping, index, filler, data, gfp); +	if (IS_ERR(folio)) +		return &folio->page; +	return folio_file_page(folio, index); +} +  struct page *read_cache_page(struct address_space *mapping, -				pgoff_t index, -				int (*filler)(void *, struct page *), -				void *data) +				pgoff_t index, filler_t *filler, void *data)  {  	return do_read_cache_page(mapping, index, filler, data,  			mapping_gfp_mask(mapping)); @@ -3920,33 +3853,32 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)  EXPORT_SYMBOL(generic_file_write_iter);  /** - * try_to_release_page() - release old fs-specific metadata on a page - * - * @page: the page which the kernel is trying to free - * @gfp_mask: memory allocation flags (and I/O mode) + * filemap_release_folio() - Release fs-specific metadata on a folio. 
+ * @folio: The folio which the kernel is trying to free. + * @gfp: Memory allocation flags (and I/O mode).   * - * The address_space is to try to release any data against the page - * (presumably at page->private). + * The address_space is trying to release any data attached to a folio + * (presumably at folio->private).   * - * This may also be called if PG_fscache is set on a page, indicating that the - * page is known to the local caching routines. + * This will also be called if the private_2 flag is set on a page, + * indicating that the folio has other metadata associated with it.   * - * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). + * The @gfp argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block + * (__GFP_RECLAIM & __GFP_FS).   * - * Return: %1 if the release was successful, otherwise return zero. + * Return: %true if the release was successful, otherwise %false.   */ -int try_to_release_page(struct page *page, gfp_t gfp_mask) +bool filemap_release_folio(struct folio *folio, gfp_t gfp)  { -	struct address_space * const mapping = page->mapping; +	struct address_space * const mapping = folio->mapping; -	BUG_ON(!PageLocked(page)); -	if (PageWriteback(page)) -		return 0; +	BUG_ON(!folio_test_locked(folio)); +	if (folio_test_writeback(folio)) +		return false;  	if (mapping && mapping->a_ops->releasepage) -		return mapping->a_ops->releasepage(page, gfp_mask); -	return try_to_free_buffers(page); +		return mapping->a_ops->releasepage(&folio->page, gfp); +	return try_to_free_buffers(&folio->page);  } - -EXPORT_SYMBOL(try_to_release_page); +EXPORT_SYMBOL(filemap_release_folio); |
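
The patch converts mm/filemap.c from struct page/struct pagevec to struct folio/struct folio_batch, renaming several entry points along the way: delete_from_page_cache() becomes filemap_remove_folio(), try_to_release_page() becomes filemap_release_folio(), put_and_wait_on_page_locked() becomes folio_put_wait_locked(), and do_read_cache_page() gains a read_cache_folio() front end. The sketch below is not part of the patch; it is a minimal illustration of how mm-internal code might drive the converted batch-lookup and read helpers afterwards. The example_* functions and their use cases are hypothetical. The calls and signatures they use (find_get_entries(), the folio_batch helpers, read_cache_folio(), folio_size(), folio_nr_pages(), folio_put()) are taken from the diff itself; find_get_entries() is assumed to be reachable via mm/internal.h, as it is for mm/filemap.c, and kmap_local_folio() is assumed to be available in this kernel.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/string.h>
#include <linux/xarray.h>
#include "internal.h"	/* find_get_entries(); sketch assumed to live inside mm/ */

/*
 * Illustrative only: count how many bytes of the page cache in the
 * index range [start, end] are present and uptodate.  A large folio is
 * returned once by find_get_entries() and accounts for folio_size()
 * bytes, rather than once per subpage as the old pagevec interface did.
 */
static size_t example_count_uptodate_bytes(struct address_space *mapping,
					   pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	size_t bytes = 0;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (find_get_entries(mapping, start, end, &fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* Shadow, swap and DAX entries are value entries. */
			if (xa_is_value(folio)) {
				start = indices[i] + 1;
				continue;
			}

			if (folio_test_uptodate(folio))
				bytes += folio_size(folio);

			/* Resume after the whole (possibly large) folio. */
			start = folio->index + folio_nr_pages(folio);

			/* find_get_entries() took a reference on the folio. */
			folio_put(folio);
		}
		/* Reset the batch before the next lookup, as filemap_read() does. */
		folio_batch_init(&fbatch);
	}
	return bytes;
}

/*
 * Illustrative only: read the first folio of a file through the
 * read_cache_folio() helper added above and copy out its first bytes.
 * Only the first page is mapped because kmap_local_folio() maps one
 * page of a (possibly large) folio at a time.
 */
static int example_read_first_bytes(struct file *file, void *buf, size_t len)
{
	struct address_space *mapping = file->f_mapping;
	struct folio *folio;
	void *kaddr;

	/* A NULL filler falls back to mapping->a_ops->readpage(). */
	folio = read_cache_folio(mapping, 0, NULL, file);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* The folio comes back uptodate and with a reference held. */
	kaddr = kmap_local_folio(folio, 0);
	memcpy(buf, kaddr, len < PAGE_SIZE ? len : PAGE_SIZE);
	kunmap_local(kaddr);
	folio_put(folio);
	return 0;
}

Both helpers follow the reference-counting rules spelled out in the kerneldoc above: the lookup and read functions return folios with an elevated refcount, and the caller is responsible for the matching folio_put().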