Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	396
1 file changed, 235 insertions(+), 161 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..5de7633e1dbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
 #include <asm/mman.h>
 
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs);
 
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
 
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static int sync_page(void *word)
@@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:	page to add
  * @mapping:	the page's address_space
  * @offset:	page index
  * @gfp_mask:	page allocation mode
  *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
  * This function does not add the page to the LRU.  The caller must do that.
  */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = mem_cgroup_cache_charge(page, current->mm,
+	int error;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & ~__GFP_HIGHMEM);
 	if (error)
 		goto out;
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
+		page_cache_get(page);
+		page->mapping = mapping;
+		page->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
+		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		} else
-			mem_cgroup_uncharge_page(page);
+		} else {
+			page->mapping = NULL;
+			mem_cgroup_uncharge_cache_page(page);
+			page_cache_release(page);
+		}
 
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_uncharge_cache_page(page);
 out:
 	return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
@@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page)
  * Is there a pagecache struct page at the given (mapping, offset) tuple?
  * If yes, increment its refcount and return it; if no, return NULL.
  */
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+	void **pagep;
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page || page == RADIX_TREE_RETRY))
+			goto repeat;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
  *
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
-struct page *find_lock_page(struct address_space *mapping,
-				pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 {
 	struct page *page;
 
 repeat:
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = find_get_page(mapping, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-			VM_BUG_ON(page->index != offset);
-			goto out;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
+		VM_BUG_ON(page->index != offset);
 	}
-	read_unlock_irq(&mapping->tree_lock);
-out:
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, start, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, index, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+		if (page->mapping == NULL || page->index != index)
 			break;
 
-		page_cache_get(pages[i]);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
-	return i;
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
@@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+				(void ***)pages, *index, nr_pages, tag);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -1200,42 +1290,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		mapping = filp->f_mapping;
 		inode = mapping->host;
-		retval = 0;
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
 		if (pos < size) {
-			retval = generic_file_direct_IO(READ, iocb,
-						iov, pos, nr_segs);
+			retval = filemap_write_and_wait(mapping);
+			if (!retval) {
+				retval = mapping->a_ops->direct_IO(READ, iocb,
+							iov, pos, nr_segs);
+			}
 			if (retval > 0)
 				*ppos = pos + retval;
-		}
-		if (likely(retval != 0)) {
-			file_accessed(filp);
-			goto out;
+			if (retval) {
+				file_accessed(filp);
+				goto out;
+			}
 		}
 	}
 
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
+	for (seg = 0; seg < nr_segs; seg++) {
+		read_descriptor_t desc;
 
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp,ppos,&desc,file_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-			if (desc.count > 0)
-				break;
+		desc.written = 0;
+		desc.arg.buf = iov[seg].iov_base;
+		desc.count = iov[seg].iov_len;
+		if (desc.count == 0)
+			continue;
+		desc.error = 0;
+		do_generic_file_read(filp, ppos, &desc, file_read_actor);
+		retval += desc.written;
+		if (desc.error) {
+			retval = retval ?: desc.error;
+			break;
 		}
+		if (desc.count > 0)
+			break;
 	}
 out:
 	return retval;
@@ -1669,8 +1758,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs);
 }
 
-int remove_suid(struct dentry *dentry)
+int file_remove_suid(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	int killsuid = should_remove_suid(dentry);
 	int killpriv = security_inode_need_killpriv(dentry);
 	int error = 0;
@@ -1684,7 +1774,7 @@ int remove_suid(struct dentry *dentry)
 
 	return error;
 }
-EXPORT_SYMBOL(remove_suid);
+EXPORT_SYMBOL(file_remove_suid);
 
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
@@ -2004,11 +2094,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	struct address_space *mapping = file->f_mapping;
 	struct inode	*inode = mapping->host;
 	ssize_t		written;
+	size_t		write_len;
+	pgoff_t		end;
 
 	if (count != ocount)
 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
 
-	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+	/*
+	 * Unmap all mmappings of the file up-front.
+	 *
+	 * This will cause any pte dirty bits to be propagated into the
+	 * pageframes for the subsequent filemap_write_and_wait().
+	 */
+	write_len = iov_length(iov, *nr_segs);
+	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+	if (mapping_mapped(mapping))
+		unmap_mapping_range(mapping, pos, write_len, 0);
+
+	written = filemap_write_and_wait(mapping);
+	if (written)
+		goto out;
+
+	/*
+	 * After a write we want buffered reads to be sure to go to disk to get
+	 * the new data.  We invalidate clean cached page from the region we're
+	 * about to write.  We do this *before* the write so that we can return
+	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 */
+	if (mapping->nrpages) {
+		written = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (written)
+			goto out;
+	}
+
+	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+
+	/*
+	 * Finally, try again to invalidate clean pages which might have been
+	 * cached by non-direct readahead, or faulted in by get_user_pages()
+	 * if the source of the write was an mmap'ed region of the file
+	 * we're writing.  Either one is a pretty crazy thing to do,
+	 * so we don't support it 100%.  If this invalidation
+	 * fails, tough, the write still worked...
+	 */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT, end);
+	}
+
 	if (written > 0) {
 		loff_t end = pos + written;
 		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2158,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * i_mutex is held, which protects generic_osync_inode() from
 	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
 	 */
+out:
 	if ((written >= 0 || written == -EIOCBQUEUED) &&
 	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2395,7 +2530,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_path.dentry);
+	err = file_remove_suid(file);
 	if (err)
 		goto out;
 
@@ -2511,66 +2646,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_file_aio_write);
 
-/*
- * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
- * went wrong during pagecache shootdown.
- */
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	ssize_t retval;
-	size_t write_len;
-	pgoff_t end = 0; /* silence gcc */
-
-	/*
-	 * If it's a write, unmap all mmappings of the file up-front.  This
-	 * will cause any pte dirty bits to be propagated into the pageframes
-	 * for the subsequent filemap_write_and_wait().
-	 */
-	if (rw == WRITE) {
-		write_len = iov_length(iov, nr_segs);
-		end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
-	       	if (mapping_mapped(mapping))
-			unmap_mapping_range(mapping, offset, write_len, 0);
-	}
-
-	retval = filemap_write_and_wait(mapping);
-	if (retval)
-		goto out;
-
-	/*
-	 * After a write we want buffered reads to be sure to go to disk to get
-	 * the new data.  We invalidate clean cached page from the region we're
-	 * about to write.  We do this *before* the write so that we can return
-	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
-	 */
-	if (rw == WRITE && mapping->nrpages) {
-		retval = invalidate_inode_pages2_range(mapping,
-					offset >> PAGE_CACHE_SHIFT, end);
-		if (retval)
-			goto out;
-	}
-
-	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
-
-	/*
-	 * Finally, try again to invalidate clean pages which might have been
-	 * cached by non-direct readahead, or faulted in by get_user_pages()
-	 * if the source of the write was an mmap'ed region of the file
-	 * we're writing.  Either one is a pretty crazy thing to do,
-	 * so we don't support it 100%.  If this invalidation
-	 * fails, tough, the write still worked...
-	 */
-	if (rw == WRITE && mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
-	}
-out:
-	return retval;
-}
-
 /**
  * try_to_release_page() - release old fs-specific metadata on a page
  *
@@ -2582,9 +2657,8 @@ out:
  * Otherwise return zero.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
  *
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
 {
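The heart of this change is the lockless pagecache protocol visible in the new find_get_page(): lookups run under rcu_read_lock(), take a reference with page_cache_get_speculative(), and then re-check the radix-tree slot to catch a concurrent removal. The helper itself is added to include/linux/pagemap.h by this series; conceptually it is little more than the sketch below (a simplification, assuming the SMP case built on get_page_unless_zero() from include/linux/mm.h):

static inline int page_cache_get_speculative(struct page *page)
{
	/*
	 * Take a reference only if the refcount has not already dropped
	 * to zero.  If the page is on its way back to the allocator, the
	 * caller must retry its radix-tree lookup rather than resurrect
	 * a dying page.
	 */
	if (unlikely(!get_page_unless_zero(page)))
		return 0;
	return 1;
}

Because a lookup can win the race against __remove_from_page_cache(), the reference alone is not enough: the page may have been removed, or reused at another offset, between radix_tree_deref_slot() and the refcount bump. That is why every lookup path above re-checks page != *pagep and releases and retries on a mismatch.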
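add_to_page_cache() does not go away for callers; it survives as a wrapper that locks a freshly allocated page before handing it to add_to_page_cache_locked(). The locking moves to the caller side because, under the lockless protocol, a page must be fully set up (locked, with mapping, index and an elevated refcount) before it becomes visible in the tree. A sketch of the wrapper, hedged in that the real inline lives in include/linux/pagemap.h and may use non-atomic lock-bit helpers:

static inline int add_to_page_cache(struct page *page,
		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	/* A new page has no other users, so setting the lock bit cannot race. */
	SetPageLocked(page);
	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
	if (unlikely(error))
		ClearPageLocked(page);
	return error;
}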
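From a caller's point of view the lookup API is unchanged: find_get_page() still returns either NULL or a page holding an extra reference. What is new is that the page is found without tree_lock, so stateful users must still lock it and re-check page->mapping against truncation, exactly as the reworked find_lock_page() now does on top of find_get_page(). A minimal, hypothetical usage example (example_peek_page is not a kernel function):

static void example_peek_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_get_page(mapping, index);	/* NULL or referenced page */
	if (!page)
		return;

	lock_page(page);
	if (page->mapping == mapping) {
		/* safe to inspect or modify pagecache state here */
	}
	unlock_page(page);

	page_cache_release(page);		/* drop the lookup reference */
}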