Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 300
1 file changed, 227 insertions(+), 73 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index c4d4ace9cc70..2723104cc06a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,8 @@
 #include <linux/ramfs.h>
 #include <linux/page_idle.h>
 #include <linux/migrate.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -97,7 +99,7 @@
  *    ->i_pages lock		(__sync_single_inode)
  *
  *  ->i_mmap_rwsem
- *    ->anon_vma.lock		(vma_adjust)
+ *    ->anon_vma.lock		(vma_merge)
  *
  *  ->anon_vma.lock
  *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
@@ -470,7 +472,7 @@ EXPORT_SYMBOL(filemap_flush);
 bool filemap_range_has_page(struct address_space *mapping,
 			   loff_t start_byte, loff_t end_byte)
 {
-	struct page *page;
+	struct folio *folio;
 	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
 	pgoff_t max = end_byte >> PAGE_SHIFT;
 
@@ -479,11 +481,11 @@ bool filemap_range_has_page(struct address_space *mapping,
 
 	rcu_read_lock();
 	for (;;) {
-		page = xas_find(&xas, max);
-		if (xas_retry(&xas, page))
+		folio = xas_find(&xas, max);
+		if (xas_retry(&xas, folio))
 			continue;
 		/* Shadow entries don't count */
-		if (xa_is_value(page))
+		if (xa_is_value(folio))
 			continue;
 		/*
 		 * We don't need to try to pin this page; we're about to
@@ -494,7 +496,7 @@ bool filemap_range_has_page(struct address_space *mapping,
 	}
 	rcu_read_unlock();
 
-	return page != NULL;
+	return folio != NULL;
 }
 EXPORT_SYMBOL(filemap_range_has_page);
 
@@ -503,25 +505,27 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
 {
 	pgoff_t index = start_byte >> PAGE_SHIFT;
 	pgoff_t end = end_byte >> PAGE_SHIFT;
-	struct pagevec pvec;
-	int nr_pages;
+	struct folio_batch fbatch;
+	unsigned nr_folios;
+
+	folio_batch_init(&fbatch);
 
-	pagevec_init(&pvec);
 	while (index <= end) {
 		unsigned i;
 
-		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
-				end, PAGECACHE_TAG_WRITEBACK);
-		if (!nr_pages)
+		nr_folios = filemap_get_folios_tag(mapping, &index, end,
+				PAGECACHE_TAG_WRITEBACK, &fbatch);
+
+		if (!nr_folios)
 			break;
 
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
+		for (i = 0; i < nr_folios; i++) {
+			struct folio *folio = fbatch.folios[i];
 
-			wait_on_page_writeback(page);
-			ClearPageError(page);
+			folio_wait_writeback(folio);
+			folio_clear_error(folio);
 		}
-		pagevec_release(&pvec);
+		folio_batch_release(&fbatch);
 		cond_resched();
 	}
 }
@@ -2282,64 +2286,58 @@ out:
 EXPORT_SYMBOL(filemap_get_folios_contig);
 
 /**
- * find_get_pages_range_tag - Find and return head pages matching @tag.
- * @mapping:	the address_space to search
- * @index:	the starting page index
- * @end:	The final page index (inclusive)
- * @tag:	the tag index
- * @nr_pages:	the maximum number of pages
- * @pages:	where the resulting pages are placed
+ * filemap_get_folios_tag - Get a batch of folios matching @tag
+ * @mapping:    The address_space to search
+ * @start:      The starting page index
+ * @end:        The final page index (inclusive)
+ * @tag:        The tag index
+ * @fbatch:     The batch to fill
  *
- * Like find_get_pages_range(), except we only return head pages which are
- * tagged with @tag.  @index is updated to the index immediately after the
- * last page we return, ready for the next iteration.
+ * Same as filemap_get_folios(), but only returning folios tagged with @tag.
  *
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also update @start to index the next folio for traversal.
  */
-unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
-			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
-			struct page **pages)
+unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
+			pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
 {
-	XA_STATE(xas, &mapping->i_pages, *index);
+	XA_STATE(xas, &mapping->i_pages, *start);
 	struct folio *folio;
-	unsigned ret = 0;
-
-	if (unlikely(!nr_pages))
-		return 0;
 
 	rcu_read_lock();
-	while ((folio = find_get_entry(&xas, end, tag))) {
+	while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
 		/*
 		 * Shadow entries should never be tagged, but this iteration
 		 * is lockless so there is a window for page reclaim to evict
-		 * a page we saw tagged.  Skip over it.
+		 * a page we saw tagged. Skip over it.
 		 */
 		if (xa_is_value(folio))
			continue;
+		if (!folio_batch_add(fbatch, folio)) {
+			unsigned long nr = folio_nr_pages(folio);
 
-		pages[ret] = &folio->page;
-		if (++ret == nr_pages) {
-			*index = folio->index + folio_nr_pages(folio);
+			if (folio_test_hugetlb(folio))
+				nr = 1;
+			*start = folio->index + nr;
 			goto out;
 		}
 	}
-
 	/*
-	 * We come here when we got to @end. We take care to not overflow the
-	 * index @index as it confuses some of the callers. This breaks the
-	 * iteration when there is a page at index -1 but that is already
-	 * broken anyway.
+	 * We come here when there is no page beyond @end. We take care to not
+	 * overflow the index @start as it confuses some of the callers. This
+	 * breaks the iteration when there is a page at index -1 but that is
+	 * already broke anyway.
 	 */
 	if (end == (pgoff_t)-1)
-		*index = (pgoff_t)-1;
+		*start = (pgoff_t)-1;
 	else
-		*index = end + 1;
+		*start = end + 1;
 out:
 	rcu_read_unlock();
-	return ret;
+	return folio_batch_count(fbatch);
 }
-EXPORT_SYMBOL(find_get_pages_range_tag);
+EXPORT_SYMBOL(filemap_get_folios_tag);
 
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
@@ -2440,21 +2438,19 @@ static int filemap_read_folio(struct file *file, filler_t filler,
 }
 
 static bool filemap_range_uptodate(struct address_space *mapping,
-		loff_t pos, struct iov_iter *iter, struct folio *folio)
+		loff_t pos, size_t count, struct folio *folio,
+		bool need_uptodate)
 {
-	int count;
-
 	if (folio_test_uptodate(folio))
 		return true;
 	/* pipes can't handle partially uptodate pages */
-	if (iov_iter_is_pipe(iter))
+	if (need_uptodate)
 		return false;
 	if (!mapping->a_ops->is_partially_uptodate)
 		return false;
 	if (mapping->host->i_blkbits >= folio_shift(folio))
 		return false;
 
-	count = iter->count;
 	if (folio_pos(folio) > pos) {
 		count -= folio_pos(folio) - pos;
 		pos = 0;
@@ -2466,8 +2462,8 @@ static bool filemap_range_uptodate(struct address_space *mapping,
 }
 
 static int filemap_update_page(struct kiocb *iocb,
-		struct address_space *mapping, struct iov_iter *iter,
-		struct folio *folio)
+		struct address_space *mapping, size_t count,
+		struct folio *folio, bool need_uptodate)
 {
 	int error;
 
@@ -2501,7 +2497,8 @@ static int filemap_update_page(struct kiocb *iocb,
 		goto unlock;
 
 	error = 0;
-	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
+	if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
+				   need_uptodate))
 		goto unlock;
 
 	error = -EAGAIN;
@@ -2577,8 +2574,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
 	return 0;
 }
 
-static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
-		struct folio_batch *fbatch)
+static int filemap_get_pages(struct kiocb *iocb, size_t count,
+		struct folio_batch *fbatch, bool need_uptodate)
 {
 	struct file *filp = iocb->ki_filp;
 	struct address_space *mapping = filp->f_mapping;
@@ -2588,18 +2585,19 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 	struct folio *folio;
 	int err = 0;
 
-	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+	/* "last_index" is the index of the page beyond the end of the read */
+	last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
-	filemap_get_read_batch(mapping, index, last_index, fbatch);
+	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
 	if (!folio_batch_count(fbatch)) {
 		if (iocb->ki_flags & IOCB_NOIO)
 			return -EAGAIN;
 		page_cache_sync_readahead(mapping, ra, filp, index,
 				last_index - index);
-		filemap_get_read_batch(mapping, index, last_index, fbatch);
+		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
 	}
 	if (!folio_batch_count(fbatch)) {
 		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
@@ -2621,7 +2619,8 @@ retry:
 		if ((iocb->ki_flags & IOCB_WAITQ) &&
 		    folio_batch_count(fbatch) > 1)
 			iocb->ki_flags |= IOCB_NOWAIT;
-		err = filemap_update_page(iocb, mapping, iter, folio);
+		err = filemap_update_page(iocb, mapping, count, folio,
+					  need_uptodate);
 		if (err)
 			goto err;
 	}
@@ -2691,7 +2690,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
 			break;
 
-		error = filemap_get_pages(iocb, iter, &fbatch);
+		error = filemap_get_pages(iocb, iter->count, &fbatch,
+					  iov_iter_is_pipe(iter));
 		if (error < 0)
 			break;
 
@@ -2841,6 +2841,134 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 }
 EXPORT_SYMBOL(generic_file_read_iter);
 
+/*
+ * Splice subpages from a folio into a pipe.
+ */
+size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
+			      struct folio *folio, loff_t fpos, size_t size)
+{
+	struct page *page;
+	size_t spliced = 0, offset = offset_in_folio(folio, fpos);
+
+	page = folio_page(folio, offset / PAGE_SIZE);
+	size = min(size, folio_size(folio) - offset);
+	offset %= PAGE_SIZE;
+
+	while (spliced < size &&
+	       !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+		struct pipe_buffer *buf = pipe_head_buf(pipe);
+		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
+
+		*buf = (struct pipe_buffer) {
+			.ops	= &page_cache_pipe_buf_ops,
+			.page	= page,
+			.offset	= offset,
+			.len	= part,
+		};
+		folio_get(folio);
+		pipe->head++;
+		page++;
+		spliced += part;
+		offset = 0;
+	}
+
+	return spliced;
+}
+
+/*
+ * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
+ * a pipe.
+ */
+ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
+			    struct pipe_inode_info *pipe,
+			    size_t len, unsigned int flags)
+{
+	struct folio_batch fbatch;
+	struct kiocb iocb;
+	size_t total_spliced = 0, used, npages;
+	loff_t isize, end_offset;
+	bool writably_mapped;
+	int i, error = 0;
+
+	init_sync_kiocb(&iocb, in);
+	iocb.ki_pos = *ppos;
+
+	/* Work out how much data we can actually add into the pipe */
+	used = pipe_occupancy(pipe->head, pipe->tail);
+	npages = max_t(ssize_t, pipe->max_usage - used, 0);
+	len = min_t(size_t, len, npages * PAGE_SIZE);
+
+	folio_batch_init(&fbatch);
+
+	do {
+		cond_resched();
+
+		if (*ppos >= i_size_read(file_inode(in)))
+			break;
+
+		iocb.ki_pos = *ppos;
+		error = filemap_get_pages(&iocb, len, &fbatch, true);
+		if (error < 0)
+			break;
+
+		/*
+		 * i_size must be checked after we know the pages are Uptodate.
+		 *
+		 * Checking i_size after the check allows us to calculate
+		 * the correct value for "nr", which means the zero-filled
+		 * part of the page is not copied back to userspace (unless
+		 * another truncate extends the file - this is desired though).
+		 */
+		isize = i_size_read(file_inode(in));
+		if (unlikely(*ppos >= isize))
+			break;
+		end_offset = min_t(loff_t, isize, *ppos + len);
+
+		/*
+		 * Once we start copying data, we don't want to be touching any
+		 * cachelines that might be contended:
+		 */
+		writably_mapped = mapping_writably_mapped(in->f_mapping);
+
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+			size_t n;
+
+			if (folio_pos(folio) >= end_offset)
+				goto out;
+			folio_mark_accessed(folio);
+
+			/*
+			 * If users can be writing to this folio using arbitrary
+			 * virtual addresses, take care of potential aliasing
+			 * before reading the folio on the kernel side.
+			 */
+			if (writably_mapped)
+				flush_dcache_folio(folio);
+
+			n = min_t(loff_t, len, isize - *ppos);
+			n = splice_folio_into_pipe(pipe, folio, *ppos, n);
+			if (!n)
+				goto out;
+			len -= n;
+			total_spliced += n;
+			*ppos += n;
+			in->f_ra.prev_pos = *ppos;
+			if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+				goto out;
+		}
+
+		folio_batch_release(&fbatch);
+	} while (len);
+
+out:
+	folio_batch_release(&fbatch);
+	file_accessed(in);
+
+	return total_spliced ? total_spliced : error;
+}
+EXPORT_SYMBOL(filemap_splice_read);
+
 static inline loff_t folio_seek_hole_data(struct xa_state *xas,
 		struct address_space *mapping, struct folio *folio,
 		loff_t start, loff_t end, bool seek_data)
@@ -3263,22 +3391,24 @@ out_retry:
 }
 EXPORT_SYMBOL(filemap_fault);
 
-static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
+static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
+		pgoff_t start)
 {
 	struct mm_struct *mm = vmf->vma->vm_mm;
 
 	/* Huge page is mapped? No need to proceed. */
 	if (pmd_trans_huge(*vmf->pmd)) {
-		unlock_page(page);
-		put_page(page);
+		folio_unlock(folio);
+		folio_put(folio);
 		return true;
 	}
 
-	if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
+	if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
+		struct page *page = folio_file_page(folio, start);
 		vm_fault_t ret = do_set_pmd(vmf, page);
 		if (!ret) {
 			/* The page is mapped successfully, reference consumed. */
-			unlock_page(page);
+			folio_unlock(folio);
 			return true;
 		}
 	}
@@ -3288,8 +3418,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 
 	/* See comment in handle_pte_fault() */
 	if (pmd_devmap_trans_unstable(vmf->pmd)) {
-		unlock_page(page);
-		put_page(page);
+		folio_unlock(folio);
+		folio_put(folio);
 		return true;
 	}
 
@@ -3372,7 +3502,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	if (!folio)
 		goto out;
 
-	if (filemap_map_pmd(vmf, &folio->page)) {
+	if (filemap_map_pmd(vmf, folio, start_pgoff)) {
 		ret = VM_FAULT_NOPAGE;
 		goto out;
 	}
@@ -3587,6 +3717,30 @@ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
 }
 EXPORT_SYMBOL(read_cache_folio);
 
+/**
+ * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
+ * @mapping:	The address_space for the folio.
+ * @index:	The index that the allocated folio will contain.
+ * @gfp:	The page allocator flags to use if allocating.
+ *
+ * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
+ * any new memory allocations done using the specified allocation flags.
+ *
+ * The most likely error from this function is EIO, but ENOMEM is
+ * possible and so is EINTR.  If ->read_folio returns another error,
+ * that will be returned to the caller.
+ *
+ * The function expects mapping->invalidate_lock to be already held.
+ *
+ * Return: Uptodate folio on success, ERR_PTR() on failure.
+ */
+struct folio *mapping_read_folio_gfp(struct address_space *mapping,
+		pgoff_t index, gfp_t gfp)
+{
+	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
+}
+EXPORT_SYMBOL(mapping_read_folio_gfp);
+
 static struct page *do_read_cache_page(struct address_space *mapping,
 		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
 {
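The kernel-doc above describes filemap_get_folios_tag() as keeping the calling pattern of the old pagevec-based tag lookup: the caller supplies a folio_batch, loops until the batch comes back empty, and @start is advanced past the last folio returned. Below is a minimal sketch of that pattern, modelled on the __filemap_fdatawait_range() conversion in this diff; the function name wait_for_writeback_range() is made up for illustration and is not part of the patch.

/*
 * Illustrative sketch only (not part of the patch): walk every folio in
 * [start, end] tagged for writeback and wait on each one, in the same
 * style as the __filemap_fdatawait_range() conversion above.
 */
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void wait_for_writeback_range(struct address_space *mapping,
				     pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	unsigned int i, nr_folios;

	folio_batch_init(&fbatch);
	while ((nr_folios = filemap_get_folios_tag(mapping, &start, end,
					PAGECACHE_TAG_WRITEBACK, &fbatch))) {
		for (i = 0; i < nr_folios; i++)
			folio_wait_writeback(fbatch.folios[i]);
		/* Drop the references taken by filemap_get_folios_tag(). */
		folio_batch_release(&fbatch);
		cond_resched();
	}
}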
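filemap_splice_read() is exported with the same prototype as the file_operations ->splice_read hook, so a filesystem whose reads are served entirely from the page cache could, in principle, point its splice_read at it. The snippet below is a sketch under that assumption: "myfs" and the surrounding generic_* choices are illustrative and not taken from this patch, and a real filesystem still needs its own handling for O_DIRECT files, which this helper does not cover.

/*
 * Hypothetical example (not part of the patch): a purely page-cache
 * backed filesystem wiring the new helper in as its ->splice_read.
 */
#include <linux/fs.h>
#include <linux/splice.h>

static const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.mmap		= generic_file_mmap,
	.splice_read	= filemap_splice_read,	/* added by this patch */
	.splice_write	= iter_file_splice_write,
};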
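mapping_read_folio_gfp() is aimed at callers that must read through the page cache while restricting allocation context, and its kernel-doc notes that mapping->invalidate_lock is expected to be held. A hedged sketch of such a caller follows; the function name and the GFP_NOFS choice are illustrative assumptions, not taken from the patch.

/*
 * Illustrative sketch only: read the first folio of a mapping with
 * GFP_NOFS so the allocator cannot recurse into the filesystem, as a
 * caller already holding mapping->invalidate_lock might require.
 */
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/string.h>

static int myfs_peek_first_folio(struct address_space *mapping,
				 void *dst, size_t len)
{
	struct folio *folio;
	void *kaddr;

	folio = mapping_read_folio_gfp(mapping, 0, GFP_NOFS);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	kaddr = kmap_local_folio(folio, 0);
	memcpy(dst, kaddr, min(len, folio_size(folio)));
	kunmap_local(kaddr);
	folio_put(folio);
	return 0;
}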