Diffstat (limited to 'mm/readahead.c')
-rw-r--r--  mm/readahead.c | 256
1 file changed, 228 insertions, 28 deletions
diff --git a/mm/readahead.c b/mm/readahead.c
index 6ae5693de28c..d3a47546d17d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,6 +8,111 @@
  *		Initial version.
  */
 
+/**
+ * DOC: Readahead Overview
+ *
+ * Readahead is used to read content into the page cache before it is
+ * explicitly requested by the application.  Readahead only ever
+ * attempts to read pages that are not yet in the page cache.  If a
+ * page is present but not up-to-date, readahead will not try to read
+ * it.  In that case a simple ->readpage() will be requested.
+ *
+ * Readahead is triggered when an application read request (whether a
+ * system call or a page fault) finds that the requested page is not in
+ * the page cache, or that it is in the page cache and has the
+ * %PG_readahead flag set.  This flag indicates that the page was loaded
+ * as part of a previous read-ahead request and now that it has been
+ * accessed, it is time for the next read-ahead.
+ *
+ * Each readahead request is partly a synchronous read, and partly async
+ * read-ahead.  This is reflected in the struct file_ra_state which
+ * contains ->size being the total number of pages, and ->async_size
+ * which is the number of pages in the async section.  The first page in
+ * this async section will have %PG_readahead set as a trigger for a
+ * subsequent read-ahead.  Once a series of sequential reads has been
+ * established, there should be no need for a synchronous component and
+ * all read-ahead requests will be fully asynchronous.
+ *
+ * When either of the triggers causes a readahead, three numbers need to
+ * be determined: the start of the region, the size of the region, and
+ * the size of the async tail.
+ *
+ * The start of the region is simply the first page address at or after
+ * the accessed address which is not currently populated in the page
+ * cache.  This is found with a simple search in the page cache.
+ *
+ * The size of the async tail is determined by subtracting the size that
+ * was explicitly requested from the determined request size, unless
+ * this would be less than zero - then zero is used.  NOTE THIS
+ * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
+ * PAGE.
+ *
+ * The size of the region is normally determined from the size of the
+ * previous readahead which loaded the preceding pages.  This may be
+ * discovered from the struct file_ra_state for simple sequential reads,
+ * or from examining the state of the page cache when multiple
+ * sequential reads are interleaved.  Specifically: where the readahead
+ * was triggered by the %PG_readahead flag, the size of the previous
+ * readahead is assumed to be the number of pages from the triggering
+ * page to the start of the new readahead.  In these cases, the size of
+ * the previous readahead is scaled, often doubled, for the new
+ * readahead, though see get_next_ra_size() for details.
+ *
+ * If the size of the previous read cannot be determined, the number of
+ * preceding pages in the page cache is used to estimate the size of
+ * a previous read.  This estimate could easily be misled by random
+ * reads being coincidentally adjacent, so it is ignored unless it is
+ * larger than the current request, and it is not scaled up unless it
+ * is at the start of the file.
+ *
+ * In general read-ahead is accelerated at the start of the file, as
+ * reads from there are often sequential.  There are other minor
+ * adjustments to the read-ahead size in various special cases and these
+ * are best discovered by reading the code.
+ *
+ * The above calculation determines the readahead, to which any requested
+ * read size may be added.
+ *
+ * Readahead requests are sent to the filesystem using the ->readahead()
+ * address space operation, for which mpage_readahead() is a canonical
+ * implementation.  ->readahead() should normally initiate reads on all
+ * pages, but may fail to read any or all pages without causing an IO
+ * error.  The page cache reading code will issue a ->readpage() request
+ * for any page which ->readahead() does not provide, and only an error
+ * from this will be final.
+ *
+ * ->readahead() will generally call readahead_page() repeatedly to get
+ * each page from those prepared for read-ahead.  It may fail to read a
+ * page by:
+ *
+ * * not calling readahead_page() sufficiently many times, effectively
+ *   ignoring some pages, as might be appropriate if the path to
+ *   storage is congested.
+ *
+ * * failing to actually submit a read request for a given page,
+ *   possibly due to insufficient resources, or
+ *
+ * * getting an error during subsequent processing of a request.
+ *
+ * In the last two cases, the page should be unlocked to indicate that
+ * the read attempt has failed.  In the first case the page will be
+ * unlocked by the caller.
+ *
+ * Those pages not in the final ``async_size`` of the request should be
+ * considered to be important and ->readahead() should not fail them due
+ * to congestion or temporary resource unavailability, but should wait
+ * for necessary resources (e.g. memory or indexing information) to
+ * become available.  Pages in the final ``async_size`` may be
+ * considered less urgent and failure to read them is more acceptable.
+ * In this case it is best to use delete_from_page_cache() to remove the
+ * pages from the page cache as is automatically done for pages that
+ * were not fetched with readahead_page().  This will allow a
+ * subsequent synchronous read-ahead request to try them again.  If they
+ * are left in the page cache, then they will be read individually using
+ * ->readpage().
+ *
+ */
+
 #include <linux/kernel.h>
 #include <linux/dax.h>
 #include <linux/gfp.h>
@@ -51,7 +156,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
 		if (!trylock_page(page))
 			BUG();
 		page->mapping = mapping;
-		do_invalidatepage(page, 0, PAGE_SIZE);
+		folio_invalidate(page_folio(page), 0, PAGE_SIZE);
 		page->mapping = NULL;
 		unlock_page(page);
 	}
@@ -127,8 +232,17 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
 
 	if (aops->readahead) {
 		aops->readahead(rac);
-		/* Clean up the remaining pages */
+		/*
+		 * Clean up the remaining pages.  The sizes in ->ra
+		 * may be used to size the next read-ahead, so make sure
+		 * they accurately reflect what happened.
+		 */
 		while ((page = readahead_page(rac))) {
+			rac->ra->size -= 1;
+			if (rac->ra->async_size > 0) {
+				rac->ra->async_size -= 1;
+				delete_from_page_cache(page);
+			}
 			unlock_page(page);
 			put_page(page);
 		}
@@ -148,7 +262,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
 
 	blk_finish_plug(&plug);
 
-	BUG_ON(!list_empty(pages));
+	BUG_ON(pages && !list_empty(pages));
 	BUG_ON(readahead_count(rac));
 
 out:
@@ -196,9 +310,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 	 * Preallocate as many pages as we will need.
 	 */
 	for (i = 0; i < nr_to_read; i++) {
-		struct page *page = xa_load(&mapping->i_pages, index + i);
+		struct folio *folio = xa_load(&mapping->i_pages, index + i);
 
-		if (page && !xa_is_value(page)) {
+		if (folio && !xa_is_value(folio)) {
 			/*
 			 * Page already present?  Kick off the current batch
 			 * of contiguous pages before continuing with the
@@ -212,21 +326,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 			continue;
 		}
 
-		page = __page_cache_alloc(gfp_mask);
-		if (!page)
+		folio = filemap_alloc_folio(gfp_mask, 0);
+		if (!folio)
 			break;
 		if (mapping->a_ops->readpages) {
-			page->index = index + i;
-			list_add(&page->lru, &page_pool);
-		} else if (add_to_page_cache_lru(page, mapping, index + i,
+			folio->index = index + i;
+			list_add(&folio->lru, &page_pool);
+		} else if (filemap_add_folio(mapping, folio, index + i,
 					gfp_mask) < 0) {
-			put_page(page);
+			folio_put(folio);
 			read_pages(ractl, &page_pool, true);
 			i = ractl->_index + ractl->_nr_pages - index - 1;
 			continue;
 		}
 		if (i == nr_to_read - lookahead_size)
-			SetPageReadahead(page);
+			folio_set_readahead(folio);
 		ractl->_nr_pages++;
 	}
 
@@ -247,7 +361,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
  * behaviour which would occur if page allocations are causing VM writeback.
  * We really don't want to intermingle reads and writes like that.
  */
-void do_page_cache_ra(struct readahead_control *ractl,
+static void do_page_cache_ra(struct readahead_control *ractl,
 		unsigned long nr_to_read, unsigned long lookahead_size)
 {
 	struct inode *inode = ractl->mapping->host;
@@ -432,10 +546,102 @@ static int try_context_readahead(struct address_space *mapping,
 }
 
 /*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size.  I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
+static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
+		pgoff_t mark, unsigned int order, gfp_t gfp)
+{
+	int err;
+	struct folio *folio = filemap_alloc_folio(gfp, order);
+
+	if (!folio)
+		return -ENOMEM;
+	if (mark - index < (1UL << order))
+		folio_set_readahead(folio);
+	err = filemap_add_folio(ractl->mapping, folio, index, gfp);
+	if (err)
+		folio_put(folio);
+	else
+		ractl->_nr_pages += 1UL << order;
+	return err;
+}
+
+void page_cache_ra_order(struct readahead_control *ractl,
+		struct file_ra_state *ra, unsigned int new_order)
+{
+	struct address_space *mapping = ractl->mapping;
+	pgoff_t index = readahead_index(ractl);
+	pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+	pgoff_t mark = index + ra->size - ra->async_size;
+	int err = 0;
+	gfp_t gfp = readahead_gfp_mask(mapping);
+
+	if (!mapping_large_folio_support(mapping) || ra->size < 4)
+		goto fallback;
+
+	limit = min(limit, index + ra->size - 1);
+
+	if (new_order < MAX_PAGECACHE_ORDER) {
+		new_order += 2;
+		if (new_order > MAX_PAGECACHE_ORDER)
+			new_order = MAX_PAGECACHE_ORDER;
+		while ((1 << new_order) > ra->size)
+			new_order--;
+	}
+
+	while (index <= limit) {
+		unsigned int order = new_order;
+
+		/* Align with smaller pages if needed */
+		if (index & ((1UL << order) - 1)) {
+			order = __ffs(index);
+			if (order == 1)
+				order = 0;
+		}
+		/* Don't allocate pages past EOF */
+		while (index + (1UL << order) - 1 > limit) {
+			if (--order == 1)
+				order = 0;
+		}
+		err = ra_alloc_folio(ractl, index, mark, order, gfp);
+		if (err)
+			break;
+		index += 1UL << order;
+	}
+
+	if (index > limit) {
+		ra->size += index - limit - 1;
+		ra->async_size += index - limit - 1;
+	}
+
+	read_pages(ractl, NULL, false);
+
+	/*
+	 * If there were already pages in the page cache, then we may have
+	 * left some gaps.  Let the regular readahead code take care of this
+	 * situation.
+	 */
+	if (!err)
+		return;
+fallback:
+	do_page_cache_ra(ractl, ra->size, ra->async_size);
+}
+
+/*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static void ondemand_readahead(struct readahead_control *ractl,
-		bool hit_readahead_marker, unsigned long req_size)
+		struct folio *folio, unsigned long req_size)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
 	struct file_ra_state *ra = ractl->ra;
@@ -470,12 +676,12 @@ static void ondemand_readahead(struct readahead_control *ractl,
 	}
 
 	/*
-	 * Hit a marked page without valid readahead state.
+	 * Hit a marked folio without valid readahead state.
 	 * E.g. interleaved reads.
 	 * Query the pagecache for async_size, which normally equals to
 	 * readahead size. Ramp it up and use it as the new readahead size.
 	 */
-	if (hit_readahead_marker) {
+	if (folio) {
 		pgoff_t start;
 
 		rcu_read_lock();
@@ -548,7 +754,7 @@ readit:
 	}
 
 	ractl->_index = ra->start;
-	do_page_cache_ra(ractl, ra->size, ra->async_size);
+	page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
 }
 
 void page_cache_sync_ra(struct readahead_control *ractl,
@@ -576,12 +782,12 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 	}
 
 	/* do read-ahead */
-	ondemand_readahead(ractl, false, req_count);
+	ondemand_readahead(ractl, NULL, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_ra);
 
 void page_cache_async_ra(struct readahead_control *ractl,
-		struct page *page, unsigned long req_count)
+		struct folio *folio, unsigned long req_count)
 {
 	/* no read-ahead */
 	if (!ractl->ra->ra_pages)
@@ -590,22 +796,16 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	/*
 	 * Same bit is used for PG_readahead and PG_reclaim.
 	 */
-	if (PageWriteback(page))
+	if (folio_test_writeback(folio))
 		return;
 
-	ClearPageReadahead(page);
-
-	/*
-	 * Defer asynchronous read-ahead on IO congestion.
-	 */
-	if (inode_read_congested(ractl->mapping->host))
-		return;
+	folio_clear_readahead(folio);
 
 	if (blk_cgroup_congested())
 		return;
 
 	/* do read-ahead */
-	ondemand_readahead(ractl, true, req_count);
+	ondemand_readahead(ractl, folio, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_ra);
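
The ->readahead() contract described in the DOC comment can be illustrated with a short sketch.  The code below is not part of the patch: myfs_readahead() and myfs_fill_page() are invented names for a hypothetical filesystem that fills each page synchronously.

static void myfs_readahead(struct readahead_control *rac)
{
	struct page *page;

	/*
	 * readahead_page() returns each locked page in turn, with a
	 * reference held for the caller.  That reference must be dropped,
	 * and every page taken from the ractl must end up unlocked.
	 */
	while ((page = readahead_page(rac))) {
		if (myfs_fill_page(rac->file, page) == 0)
			SetPageUptodate(page);
		/*
		 * On failure the page is simply unlocked without being
		 * marked uptodate, so a later read falls back to
		 * ->readpage(), as the overview describes.
		 */
		unlock_page(page);
		put_page(page);
	}
}

A real implementation would normally batch pages into bios and unlock them from the I/O completion path instead, as mpage_readahead() does.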
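
The reworked entry points can be driven roughly as in the following sketch of a hypothetical caller (myfs_maybe_readahead() is not a real function).  It mirrors the generic file read path: a missing page triggers page_cache_sync_ra(), while a folio carrying the %PG_readahead marker triggers page_cache_async_ra(), which after this change takes the folio itself rather than a bool.

static void myfs_maybe_readahead(struct file *file, struct folio *folio,
				 pgoff_t index, unsigned long nr_to_read)
{
	DEFINE_READAHEAD(ractl, file, &file->f_ra, file->f_mapping, index);

	if (!folio) {
		/* Nothing cached at @index: start synchronous readahead. */
		page_cache_sync_ra(&ractl, nr_to_read);
	} else if (folio_test_readahead(folio)) {
		/* Hit the async marker: extend the readahead window. */
		page_cache_async_ra(&ractl, folio, nr_to_read);
	}
}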