Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	372
1 file changed, 89 insertions(+), 283 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 99c49eeae71b..d5e7c2029d16 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
@@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping) ||
+ if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
@@ -827,15 +827,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp,
+ void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
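
The reworked insertion loop above follows the standard XArray idiom: any nodes needed for a split are allocated outside the lock (xas_split_alloc()), the split and the store happen under xas_lock_irq() (xas_split(), xas_store()), and xas_nomem() allocates memory and requests a retry when the store itself runs dry. A minimal sketch of that retry skeleton, detached from the page-cache specifics (illustrative helper, not the patch's code):

#include <linux/xarray.h>

/* Sketch of the xas_nomem() retry idiom used by the hunk above. */
static int xa_store_irq_retry(struct xarray *xa, unsigned long index,
                              void *item, gfp_t gfp)
{
        XA_STATE(xas, xa, index);

        do {
                xas_lock_irq(&xas);
                xas_store(&xas, item);  /* may record -ENOMEM in xas */
                xas_unlock_irq(&xas);
                /* Allocates a node and asks for a retry on -ENOMEM. */
        } while (xas_nomem(&xas, gfp));

        return xas_error(&xas);
}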
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -1645,19 +1665,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
@@ -1685,7 +1705,6 @@ repeat:
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1693,40 +1712,37 @@ out:
}
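
With find_get_entry() now returning the head page, a caller that needs the precise subpage performs the conversion itself. A hedged sketch of that calling pattern (the helper name is invented here):

#include <linux/pagemap.h>

/* Sketch only: resolve the head page returned by find_get_entry()
 * to the exact subpage; shadow/swap entries pass through untouched. */
static struct page *find_get_subpage(struct address_space *mapping,
                                     pgoff_t index)
{
        struct page *page = find_get_entry(mapping, index);

        if (page && !xa_is_value(page))
                page = find_subpage(page, index);
        return page;
}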
/**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page_mapping(page) != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
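
The VM_BUG_ON assertions switch from an exact index comparison to thp_contains(), which only demands that @index fall inside the compound page. Roughly (simplified sketch; the real helper also special-cases hugetlbfs, which indexes the cache in units of the huge page size):

#include <linux/huge_mm.h>

/* Simplified restatement of what thp_contains() verifies. */
static bool head_covers_index(struct page *head, pgoff_t index)
{
        return head->index <= index &&
               index < head->index + thp_nr_pages(head);
}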
/**
* pagecache_get_page - Find and get a reference to a page.
@@ -1741,6 +1757,8 @@ EXPORT_SYMBOL(find_lock_entry);
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
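
A hedged usage sketch for the new flag: with FGP_HEAD set, pagecache_get_page() skips its final find_subpage() step and the caller receives the compound head, here locked because FGP_LOCK is passed as well:

#include <linux/pagemap.h>

/* Sketch: fetch the locked head page covering 'index', if present.
 * No FGP_CREAT, so the gfp mask is unused and may be zero. */
static struct page *get_locked_head(struct address_space *mapping,
                                    pgoff_t index)
{
        return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
}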
@@ -1781,12 +1799,12 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != index, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED)
@@ -1796,11 +1814,13 @@ repeat:
if (page_is_idle(page))
clear_page_idle(page);
}
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
- if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp_mask |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp_mask &= ~__GFP_FS;
@@ -2179,6 +2199,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if (written && (iocb->ki_flags & IOCB_WAITQ))
+ iocb->ki_flags |= IOCB_NOWAIT;
+
for (;;) {
struct page *page;
pgoff_t end_index;
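
The invariant behind the IOCB_NOWAIT promotion: once any bytes have been copied, the partial count must reach the caller, so the read may no longer park itself and return -EIOCBQUEUED; a page that is not ready now produces -EAGAIN instead, and the function's exit path reduces to this shape (sketch, hypothetical helper name):

#include <linux/types.h>

/* Sketch: a partial copy always wins over a late error or -EAGAIN. */
static ssize_t buffered_read_retval(ssize_t written, int error)
{
        return written ? written : error;
}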
@@ -2568,8 +2596,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2608,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
@@ -2601,10 +2628,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
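
Both mmap readahead paths now feed a struct readahead_control: the old (mapping, file, offset) argument triple rides in one on-stack struct consumed by page_cache_sync_ra() and do_page_cache_ra(). A minimal sketch of the pattern (standalone helper invented for illustration; signatures as used by the hunks above):

#include <linux/pagemap.h>

/* Sketch of the readahead_control calling convention. */
static void kick_sync_readahead(struct file *file, pgoff_t index,
                                unsigned long nr_to_read)
{
        DEFINE_READAHEAD(ractl, file, file->f_mapping, index);

        page_cache_sync_ra(&ractl, &file->f_ra, nr_to_read);
}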
@@ -2793,42 +2821,42 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *page;
+ struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
- xas_for_each(&xas, page, end_pgoff) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(head))
goto next;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(page))
+ if (PageLocked(head))
goto next;
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto next;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(head != xas_reload(&xas)))
goto skip;
- page = find_subpage(page, xas.xa_index);
+ page = find_subpage(head, xas.xa_index);
- if (!PageUptodate(page) ||
+ if (!PageUptodate(head) ||
PageReadahead(page) ||
PageHWPoison(page))
goto skip;
- if (!trylock_page(page))
+ if (!trylock_page(head))
goto skip;
- if (page->mapping != mapping || !PageUptodate(page))
+ if (head->mapping != mapping || !PageUptodate(head))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= max_idx)
+ if (xas.xa_index >= max_idx)
goto unlock;
if (mmap_miss > 0)
@@ -2840,12 +2868,12 @@ void filemap_map_pages(struct vm_fault *vmf,
last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, page))
goto unlock;
- unlock_page(page);
+ unlock_page(head);
goto next;
unlock:
- unlock_page(page);
+ unlock_page(head);
skip:
- put_page(page);
+ put_page(head);
next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
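
The head/page split above encodes one rule: speculative references, locking, the uptodate and truncation checks, and the final put all act on the compound head, while the PTE is populated with the precise subpage. Reduced to a sketch (error paths trimmed; assumes the caller already holds a speculative reference on the head):

#include <linux/pagemap.h>

/* Sketch: lock the head, map the exact subpage, release the head. */
static void map_one_page(struct vm_fault *vmf, struct page *head,
                         pgoff_t index)
{
        struct page *page = find_subpage(head, index);

        if (!trylock_page(head))
                goto put;
        if (head->mapping && PageUptodate(head))
                alloc_set_pte(vmf, page);       /* PTE gets the subpage */
        unlock_page(head);
put:
        put_page(head);
}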
@@ -2984,7 +3012,7 @@ filler:
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
@@ -3093,228 +3121,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page_gfp);
-/*
- * Don't operate on ranges the page cache doesn't support, and don't exceed the
- * LFS limits. If pos is under the limit it becomes a short access. If it
- * exceeds the limit we return -EFBIG.
- */
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
-
- *count = min(*count, max_size - pos);
-
- return 0;
-}
-
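
The removed generic_write_check_limits() reduces to clamping [pos, pos + count) against whichever limit applies first: RLIMIT_FSIZE (with SIGXFSZ on outright violation), then the filesystem's s_maxbytes. The clamping rule restated as plain, runnable C outside the kernel:

#include <stdint.h>

/* Userspace restatement of the clamp: shorten a write at 'pos' so it
 * never crosses 'limit'; refuse it entirely if it starts at or past
 * the limit (the kernel returns -EFBIG there). */
static int64_t clamp_write_count(int64_t pos, int64_t count, int64_t limit)
{
        if (pos >= limit)
                return -1;
        if (count > limit - pos)
                count = limit - pos;
        return count;
}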
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- loff_t count;
- int ret;
-
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
- if (!iov_iter_count(from))
- return 0;
-
- /* FIXME: this is for backwards compatibility with 2.4 */
- if (iocb->ki_flags & IOCB_APPEND)
- iocb->ki_pos = i_size_read(inode);
-
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
- return -EINVAL;
-
- count = iov_iter_count(from);
- ret = generic_write_check_limits(file, iocb->ki_pos, &count);
- if (ret)
- return ret;
-
- iov_iter_truncate(from, count);
- return iov_iter_count(from);
-}
-EXPORT_SYMBOL(generic_write_checks);
-
-/*
- * Performs necessary checks before doing a clone.
- *
- * Can adjust amount of bytes to clone via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the clone should be allowed.
- */
-int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
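
The remap path above and the copy path below reject overlapping ranges within the same file using the same interval test. Stated on its own as a standalone sketch:

#include <stdbool.h>
#include <stdint.h>

/* The shared overlap test: [pos_out, pos_out + len) must not
 * intersect [pos_in, pos_in + len) when both ranges are in the
 * same inode. */
static bool ranges_overlap(uint64_t pos_in, uint64_t pos_out, uint64_t len)
{
        return pos_out + len > pos_in && pos_out < pos_in + len;
}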
-
-
-/*
- * Performs common checks before doing a file copy/clone
- * from @file_in to @file_out.
- */
-int generic_file_rw_checks(struct file *file_in, struct file *file_out)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
-
- /* Don't copy dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- if (!(file_in->f_mode & FMODE_READ) ||
- !(file_out->f_mode & FMODE_WRITE) ||
- (file_out->f_flags & O_APPEND))
- return -EBADF;
-
- return 0;
-}
-
-/*
- * Performs necessary checks before doing a file copy
- *
- * Can adjust amount of bytes to copy via @req_count argument.
- * Returns appropriate error code that caller should return or
- * zero in case the copy should be allowed.
- */
-int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t *req_count, unsigned int flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- uint64_t count = *req_count;
- loff_t size_in;
- int ret;
-
- ret = generic_file_rw_checks(file_in, file_out);
- if (ret)
- return ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EOVERFLOW;
-
- /* Shorten the copy to EOF */
- size_in = i_size_read(inode_in);
- if (pos_in >= size_in)
- count = 0;
- else
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /* Don't allow overlapped copying within the same file. */
- if (inode_in == inode_out &&
- pos_out + count > pos_in &&
- pos_out < pos_in + count)
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)