Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 372
1 file changed, 89 insertions, 283 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 99c49eeae71b..d5e7c2029d16 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
 		freepage(page);
 
 	if (PageTransHuge(page) && !PageHuge(page)) {
-		page_ref_sub(page, HPAGE_PMD_NR);
+		page_ref_sub(page, thp_nr_pages(page));
 		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
 	} else {
 		put_page(page);
@@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 		.range_end = end,
 	};
 
-	if (!mapping_cap_writeback_dirty(mapping) ||
+	if (!mapping_can_writeback(mapping) ||
 	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
@@ -827,15 +827,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-static int __add_to_page_cache_locked(struct page *page,
-				      struct address_space *mapping,
-				      pgoff_t offset, gfp_t gfp_mask,
-				      void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+					struct address_space *mapping,
+					pgoff_t offset, gfp_t gfp,
+					void **shadowp)
 {
 	XA_STATE(xas, &mapping->i_pages, offset);
 	int huge = PageHuge(page);
 	int error;
-	void *old;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page,
 	page->index = offset;
 
 	if (!huge) {
-		error = mem_cgroup_charge(page, current->mm, gfp_mask);
+		error = mem_cgroup_charge(page, current->mm, gfp);
 		if (error)
 			goto error;
 	}
 
+	gfp &= GFP_RECLAIM_MASK;
+
 	do {
+		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+		void *entry, *old = NULL;
+
+		if (order > thp_order(page))
+			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+					order, gfp);
 		xas_lock_irq(&xas);
-		old = xas_load(&xas);
-		if (old && !xa_is_value(old))
-			xas_set_err(&xas, -EEXIST);
+		xas_for_each_conflict(&xas, entry) {
+			old = entry;
+			if (!xa_is_value(entry)) {
+				xas_set_err(&xas, -EEXIST);
+				goto unlock;
+			}
+		}
+
+		if (old) {
+			if (shadowp)
+				*shadowp = old;
+			/* entry may have been split before we acquired lock */
+			order = xa_get_order(xas.xa, xas.xa_index);
+			if (order > thp_order(page)) {
+				xas_split(&xas, old, order);
+				xas_reset(&xas);
+			}
+		}
+
 		xas_store(&xas, page);
 		if (xas_error(&xas))
 			goto unlock;
 
-		if (xa_is_value(old)) {
+		if (old)
 			mapping->nrexceptional--;
-			if (shadowp)
-				*shadowp = old;
-		}
 		mapping->nrpages++;
 
 		/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page,
 			__inc_lruvec_page_state(page, NR_FILE_PAGES);
 unlock:
 		xas_unlock_irq(&xas);
-	} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+	} while (xas_nomem(&xas, gfp));
 
 	if (xas_error(&xas)) {
 		error = xas_error(&xas);
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
  * unlock_page - unlock a locked page
  * @page: the page
  *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
  * Also wakes sleepers in wait_on_page_writeback() because the wakeup
  * mechanism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -1645,19 +1665,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
 /**
  * find_get_entry - find and get a page cache entry
  * @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
  *
  * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
  *
  * If the slot holds a shadow entry of a previously evicted page, or a
  * swap entry from shmem/tmpfs, it is returned.
  *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
 {
-	XA_STATE(xas, &mapping->i_pages, offset);
+	XA_STATE(xas, &mapping->i_pages, index);
 	struct page *page;
 
 	rcu_read_lock();
@@ -1685,7 +1705,6 @@ repeat:
 		put_page(page);
 		goto repeat;
 	}
-	page = find_subpage(page, offset);
 out:
 	rcu_read_unlock();
 
@@ -1693,40 +1712,37 @@ out:
 }
 
 /**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
  *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
  *
  * If the slot holds a shadow entry of a previously evicted page, or a
  * swap entry from shmem/tmpfs, it is returned.
  *
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
  */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
 {
 	struct page *page;
 
 repeat:
-	page = find_get_entry(mapping, offset);
+	page = find_get_entry(mapping, index);
 	if (page && !xa_is_value(page)) {
 		lock_page(page);
 		/* Has the page been truncated? */
-		if (unlikely(page_mapping(page) != mapping)) {
+		if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
 			put_page(page);
 			goto repeat;
 		}
-		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
 	}
 	return page;
 }
-EXPORT_SYMBOL(find_lock_entry);
 
 /**
  * pagecache_get_page - Find and get a reference to a page.
@@ -1741,6 +1757,8 @@ EXPORT_SYMBOL(find_lock_entry);
  *
  * * %FGP_ACCESSED - The page will be marked accessed.
  * * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ *   rather than the exact page specified by the index.
  * * %FGP_CREAT - If no page is present then a new page is allocated using
  *   @gfp_mask and added to the page cache and the VM's LRU list.
  *   The page is returned locked and with an increased refcount.
@@ -1781,12 +1799,12 @@ repeat:
 		}
 
 		/* Has the page been truncated? */
-		if (unlikely(compound_head(page)->mapping != mapping)) {
+		if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
 			put_page(page);
 			goto repeat;
 		}
-		VM_BUG_ON_PAGE(page->index != index, page);
+		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
 	}
 
 	if (fgp_flags & FGP_ACCESSED)
@@ -1796,11 +1814,13 @@ repeat:
 		if (page_is_idle(page))
 			clear_page_idle(page);
 	}
+	if (!(fgp_flags & FGP_HEAD))
+		page = find_subpage(page, index);
 no_page:
 	if (!page && (fgp_flags & FGP_CREAT)) {
 		int err;
-		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp_mask |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
 			gfp_mask &= ~__GFP_FS;
@@ -2179,6 +2199,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
 	offset = *ppos & ~PAGE_MASK;
 
+	/*
+	 * If we've already successfully copied some data, then we
+	 * can no longer safely return -EIOCBQUEUED. Hence mark
+	 * an async read NOWAIT at that point.
+	 */
+	if (written && (iocb->ki_flags & IOCB_WAITQ))
+		iocb->ki_flags |= IOCB_NOWAIT;
+
 	for (;;) {
 		struct page *page;
 		pgoff_t end_index;
@@ -2568,8 +2596,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
+	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
 	struct file *fpin = NULL;
-	pgoff_t offset = vmf->pgoff;
 	unsigned int mmap_miss;
 
 	/* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2608,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 
 	if (vmf->vma->vm_flags & VM_SEQ_READ) {
 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-		page_cache_sync_readahead(mapping, ra, file, offset,
-					  ra->ra_pages);
+		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
 		return fpin;
 	}
 
@@ -2601,10 +2628,11 @@
 	 * mmap read-around
 	 */
 	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-	ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
 	ra->size = ra->ra_pages;
 	ra->async_size = ra->ra_pages / 4;
-	ra_submit(ra, mapping, file);
+	ractl._index = ra->start;
+	do_page_cache_ra(&ractl, ra->size, ra->async_size);
 	return fpin;
 }
 
@@ -2793,42 +2821,42 @@ void filemap_map_pages(struct vm_fault *vmf,
 	pgoff_t last_pgoff = start_pgoff;
 	unsigned long max_idx;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
-	struct page *page;
+	struct page *head, *page;
 	unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
 
 	rcu_read_lock();
-	xas_for_each(&xas, page, end_pgoff) {
-		if (xas_retry(&xas, page))
+	xas_for_each(&xas, head, end_pgoff) {
+		if (xas_retry(&xas, head))
 			continue;
-		if (xa_is_value(page))
+		if (xa_is_value(head))
 			goto next;
 
 		/*
 		 * Check for a locked page first, as a speculative
 		 * reference may adversely influence page migration.
 		 */
-		if (PageLocked(page))
+		if (PageLocked(head))
			goto next;
-		if (!page_cache_get_speculative(page))
+		if (!page_cache_get_speculative(head))
 			goto next;
 
 		/* Has the page moved or been split? */
*/ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(head != xas_reload(&xas))) goto skip; - page = find_subpage(page, xas.xa_index); + page = find_subpage(head, xas.xa_index); - if (!PageUptodate(page) || + if (!PageUptodate(head) || PageReadahead(page) || PageHWPoison(page)) goto skip; - if (!trylock_page(page)) + if (!trylock_page(head)) goto skip; - if (page->mapping != mapping || !PageUptodate(page)) + if (head->mapping != mapping || !PageUptodate(head)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= max_idx) + if (xas.xa_index >= max_idx) goto unlock; if (mmap_miss > 0) @@ -2840,12 +2868,12 @@ void filemap_map_pages(struct vm_fault *vmf, last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, page)) goto unlock; - unlock_page(page); + unlock_page(head); goto next; unlock: - unlock_page(page); + unlock_page(head); skip: - put_page(page); + put_page(head); next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) @@ -2984,7 +3012,7 @@ filler: goto out; /* - * Page is not up to date and may be locked due one of the following + * Page is not up to date and may be locked due to one of the following * case a: Page is being filled and the page lock is held * case b: Read/write error clearing the page uptodate status * case c: Truncation in progress (page locked) @@ -3093,228 +3121,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); -/* - * Don't operate on ranges the page cache doesn't support, and don't exceed the - * LFS limits. If pos is under the limit it becomes a short access. If it - * exceeds the limit we return -EFBIG. - */ -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - - *count = min(*count, max_size - pos); - - return 0; -} - -/* - * Performs necessary checks before doing a write - * - * Can adjust writing position or amount of bytes to write. - * Returns appropriate error code that caller should return or - * zero in case that write should be allowed. - */ -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - loff_t count; - int ret; - - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - /* FIXME: this is for backwards compatibility with 2.4 */ - if (iocb->ki_flags & IOCB_APPEND) - iocb->ki_pos = i_size_read(inode); - - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) - return -EINVAL; - - count = iov_iter_count(from); - ret = generic_write_check_limits(file, iocb->ki_pos, &count); - if (ret) - return ret; - - iov_iter_truncate(from, count); - return iov_iter_count(from); -} -EXPORT_SYMBOL(generic_write_checks); - -/* - * Performs necessary checks before doing a clone. - * - * Can adjust amount of bytes to clone via @req_count argument. - * Returns appropriate error code that caller should return or - * zero in case the clone should be allowed. 
- */
-int generic_remap_checks(struct file *file_in, loff_t pos_in,
-			 struct file *file_out, loff_t pos_out,
-			 loff_t *req_count, unsigned int remap_flags)
-{
-	struct inode *inode_in = file_in->f_mapping->host;
-	struct inode *inode_out = file_out->f_mapping->host;
-	uint64_t count = *req_count;
-	uint64_t bcount;
-	loff_t size_in, size_out;
-	loff_t bs = inode_out->i_sb->s_blocksize;
-	int ret;
-
-	/* The start of both ranges must be aligned to an fs block. */
-	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
-		return -EINVAL;
-
-	/* Ensure offsets don't wrap. */
-	if (pos_in + count < pos_in || pos_out + count < pos_out)
-		return -EINVAL;
-
-	size_in = i_size_read(inode_in);
-	size_out = i_size_read(inode_out);
-
-	/* Dedupe requires both ranges to be within EOF. */
-	if ((remap_flags & REMAP_FILE_DEDUP) &&
-	    (pos_in >= size_in || pos_in + count > size_in ||
-	     pos_out >= size_out || pos_out + count > size_out))
-		return -EINVAL;
-
-	/* Ensure the infile range is within the infile. */
-	if (pos_in >= size_in)
-		return -EINVAL;
-	count = min(count, size_in - (uint64_t)pos_in);
-
-	ret = generic_write_check_limits(file_out, pos_out, &count);
-	if (ret)
-		return ret;
-
-	/*
-	 * If the user wanted us to link to the infile's EOF, round up to the
-	 * next block boundary for this check.
-	 *
-	 * Otherwise, make sure the count is also block-aligned, having
-	 * already confirmed the starting offsets' block alignment.
-	 */
-	if (pos_in + count == size_in) {
-		bcount = ALIGN(size_in, bs) - pos_in;
-	} else {
-		if (!IS_ALIGNED(count, bs))
-			count = ALIGN_DOWN(count, bs);
-		bcount = count;
-	}
-
-	/* Don't allow overlapped cloning within the same file. */
-	if (inode_in == inode_out &&
-	    pos_out + bcount > pos_in &&
-	    pos_out < pos_in + bcount)
-		return -EINVAL;
-
-	/*
-	 * We shortened the request but the caller can't deal with that, so
-	 * bounce the request back to userspace.
-	 */
-	if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
-		return -EINVAL;
-
-	*req_count = count;
-	return 0;
-}
-
-
-/*
- * Performs common checks before doing a file copy/clone
- * from @file_in to @file_out.
- */
-int generic_file_rw_checks(struct file *file_in, struct file *file_out)
-{
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
-
-	/* Don't copy dirs, pipes, sockets... */
-	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
-		return -EISDIR;
-	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
-		return -EINVAL;
-
-	if (!(file_in->f_mode & FMODE_READ) ||
-	    !(file_out->f_mode & FMODE_WRITE) ||
-	    (file_out->f_flags & O_APPEND))
-		return -EBADF;
-
-	return 0;
-}
-
-/*
- * Performs necessary checks before doing a file copy
- *
- * Can adjust amount of bytes to copy via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the copy should be allowed.
- */
-int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
-			     struct file *file_out, loff_t pos_out,
-			     size_t *req_count, unsigned int flags)
-{
-	struct inode *inode_in = file_inode(file_in);
-	struct inode *inode_out = file_inode(file_out);
-	uint64_t count = *req_count;
-	loff_t size_in;
-	int ret;
-
-	ret = generic_file_rw_checks(file_in, file_out);
-	if (ret)
-		return ret;
-
-	/* Don't touch certain kinds of inodes */
-	if (IS_IMMUTABLE(inode_out))
-		return -EPERM;
-
-	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
-		return -ETXTBSY;
-
-	/* Ensure offsets don't wrap. */
-	if (pos_in + count < pos_in || pos_out + count < pos_out)
-		return -EOVERFLOW;
-
-	/* Shorten the copy to EOF */
-	size_in = i_size_read(inode_in);
-	if (pos_in >= size_in)
-		count = 0;
-	else
-		count = min(count, size_in - (uint64_t)pos_in);
-
-	ret = generic_write_check_limits(file_out, pos_out, &count);
-	if (ret)
-		return ret;
-
-	/* Don't allow overlapped copying within the same file. */
-	if (inode_in == inode_out &&
-	    pos_out + count > pos_in &&
-	    pos_out < pos_in + count)
-		return -EINVAL;
-
-	*req_count = count;
-	return 0;
-}
-
 int pagecache_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)