Diffstat (limited to 'mm')
52 files changed, 2802 insertions, 1339 deletions
| diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..46ef77d5c332 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP  config HAVE_MEMBLOCK_PHYS_MAP  	bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP  	bool  config ARCH_DISCARD_MEMBLOCK @@ -149,32 +149,6 @@ config NO_BOOTMEM  config MEMORY_ISOLATION  	bool -config MOVABLE_NODE -	bool "Enable to assign a node which has only movable memory" -	depends on HAVE_MEMBLOCK -	depends on NO_BOOTMEM -	depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG -	depends on NUMA -	default n -	help -	  Allow a node to have only movable memory.  Pages used by the kernel, -	  such as direct mapping pages cannot be migrated.  So the corresponding -	  memory device cannot be hotplugged.  This option allows the following -	  two things: -	  - When the system is booting, node full of hotpluggable memory can -	  be arranged to have only movable memory so that the whole node can -	  be hot-removed. (need movable_node boot option specified). -	  - After the system is up, the option allows users to online all the -	  memory of a node as movable memory so that the whole node can be -	  hot-removed. - -	  Users who don't use the memory hotplug feature are fine with this -	  option on since they don't specify movable_node boot option or they -	  don't online memory as movable. - -	  Say Y here if you want to hotplug a whole node. -	  Say N here if you want kernel to use memory on all nodes evenly. -  #  # Only be set on architectures that have completely implemented memory hotplug  # feature. If you are not sure, don't touch it. @@ -446,6 +420,18 @@ choice  	  benefit.  endchoice +config ARCH_WANTS_THP_SWAP +       def_bool n + +config THP_SWAP +	def_bool y +	depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP +	help +	  Swap transparent huge pages in one piece, without splitting. +	  XXX: For now this only does clustered swap space allocation. + +	  For selection by architectures with reasonable THP sizes. +  config	TRANSPARENT_HUGE_PAGECACHE  	def_bool y  	depends on TRANSPARENT_HUGEPAGE @@ -683,12 +669,16 @@ config IDLE_PAGE_TRACKING  	  See Documentation/vm/idle_page_tracking.txt for more details. +# arch_add_memory() comprehends device memory +config ARCH_HAS_ZONE_DEVICE +	bool +  config ZONE_DEVICE  	bool "Device memory (pmem, etc...) hotplug support"  	depends on MEMORY_HOTPLUG  	depends on MEMORY_HOTREMOVE  	depends on SPARSEMEM_VMEMMAP -	depends on X86_64 #arch_add_memory() comprehends device memory +	depends on ARCH_HAS_ZONE_DEVICE  	help  	  Device memory hotplug support allows for establishing pmem, @@ -706,3 +696,11 @@ config ARCH_USES_HIGH_VMA_FLAGS  	bool  config ARCH_HAS_PKEYS  	bool + +config PERCPU_STATS +	bool "Collect percpu memory statistics" +	default n +	help +	  This feature collects and exposes statistics via debugfs. The +	  information includes global and per chunk statistics, which can +	  be used to help understand percpu memory usage. 
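The new THP_SWAP option only becomes selectable once an architecture opts in with ARCH_WANTS_THP_SWAP; in the code the feature is then a compile-time switch. A minimal userspace sketch of that gating pattern follows (CONFIG_THP_SWAP here is just a -D macro standing in for the generated kernel config, and the allocation helpers are made up for illustration):

/*
 * Userspace sketch of compile-time feature gating: build with
 * -DCONFIG_THP_SWAP to model an architecture that selected
 * ARCH_WANTS_THP_SWAP.  The macro name mirrors the Kconfig symbol; the
 * allocation helpers below are illustrative only.
 */
#include <stdio.h>

int swap_alloc_cluster(void)
{
	puts("allocate a whole swap cluster for the THP");
	return 0;
}

int swap_alloc_single_slot(void)
{
	puts("split the THP and allocate one slot at a time");
	return 0;
}

int swap_alloc_huge_page(void)
{
#ifdef CONFIG_THP_SWAP
	/* THP_SWAP: swap the huge page out in one piece. */
	return swap_alloc_cluster();
#else
	/* Without THP_SWAP the huge page is split before swap-out. */
	return swap_alloc_single_slot();
#endif
}

int main(void)
{
	return swap_alloc_huge_page();
}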
diff --git a/mm/Makefile b/mm/Makefile index 026f6a828a50..411bd24d4a7c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o  obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o  obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o  obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o +obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o diff --git a/mm/cleancache.c b/mm/cleancache.c index ba5d8f3e6d68..f7b9fdc79d97 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb)  	int pool_id = CLEANCACHE_NO_BACKEND_SHARED;  	if (cleancache_ops) { -		pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); +		pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);  		if (pool_id < 0)  			pool_id = CLEANCACHE_NO_POOL;  	} diff --git a/mm/compaction.c b/mm/compaction.c index 613c59e928cb..fb548e4c7bd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone)  		cond_resched(); -		if (!pfn_valid(pfn)) +		page = pfn_to_online_page(pfn); +		if (!page)  			continue; - -		page = pfn_to_page(pfn);  		if (zone != page_zone(page))  			continue; diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..2e906ef52143 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -376,6 +376,38 @@ int filemap_flush(struct address_space *mapping)  }  EXPORT_SYMBOL(filemap_flush); +/** + * filemap_range_has_page - check if a page exists in range. + * @mapping:           address space within which to check + * @start_byte:        offset in bytes where the range starts + * @end_byte:          offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. 
+ */ +bool filemap_range_has_page(struct address_space *mapping, +			   loff_t start_byte, loff_t end_byte) +{ +	pgoff_t index = start_byte >> PAGE_SHIFT; +	pgoff_t end = end_byte >> PAGE_SHIFT; +	struct pagevec pvec; +	bool ret; + +	if (end_byte < start_byte) +		return false; + +	if (mapping->nrpages == 0) +		return false; + +	pagevec_init(&pvec, 0); +	if (!pagevec_lookup(&pvec, mapping, index, 1)) +		return false; +	ret = (pvec.pages[0]->index <= end); +	pagevec_release(&pvec); +	return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); +  static int __filemap_fdatawait_range(struct address_space *mapping,  				     loff_t start_byte, loff_t end_byte)  { @@ -768,10 +800,10 @@ struct wait_page_key {  struct wait_page_queue {  	struct page *page;  	int bit_nr; -	wait_queue_t wait; +	wait_queue_entry_t wait;  }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)  {  	struct wait_page_key *key = arg;  	struct wait_page_queue *wait_page @@ -834,7 +866,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,  		struct page *page, int bit_nr, int state, bool lock)  {  	struct wait_page_queue wait_page; -	wait_queue_t *wait = &wait_page.wait; +	wait_queue_entry_t *wait = &wait_page.wait;  	int ret = 0;  	init_wait(wait); @@ -845,9 +877,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,  	for (;;) {  		spin_lock_irq(&q->lock); -		if (likely(list_empty(&wait->task_list))) { +		if (likely(list_empty(&wait->entry))) {  			if (lock) -				__add_wait_queue_tail_exclusive(q, wait); +				__add_wait_queue_entry_tail_exclusive(q, wait);  			else  				__add_wait_queue(q, wait);  			SetPageWaiters(page); @@ -907,7 +939,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)   *   * Add an arbitrary @waiter to the wait queue for the nominated @page.   
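filemap_range_has_page() shifts the byte offsets down by PAGE_SHIFT and asks the page cache for the first page at or after the start index, reporting a hit only if that page also lies at or before the end index. A small userspace model of the same check, assuming 4 KiB pages and using a sorted array of indices in place of the radix tree:

/*
 * Userspace model of the filemap_range_has_page() check: convert a byte
 * range to page indices and ask whether any cached index falls inside it.
 * A sorted array of indices stands in for the page-cache radix tree, and
 * PAGE_SHIFT is assumed to be 12 (4 KiB pages).
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static bool range_has_page(const unsigned long *cached, size_t n,
			   long long start_byte, long long end_byte)
{
	unsigned long index = (unsigned long)(start_byte >> PAGE_SHIFT);
	unsigned long end = (unsigned long)(end_byte >> PAGE_SHIFT);
	size_t i;

	if (end_byte < start_byte || n == 0)
		return false;

	/* Find the first cached index at or after 'index' (pagevec_lookup). */
	for (i = 0; i < n; i++) {
		if (cached[i] >= index)
			return cached[i] <= end;
	}
	return false;
}

int main(void)
{
	const unsigned long cached[] = { 3, 7, 42 };	/* cached page indices */

	/* Bytes 0x7000-0x7fff map to index 7, which is cached. */
	printf("%d\n", range_has_page(cached, 3, 0x7000, 0x7fff));	/* 1 */
	/* Bytes 0x9000-0xafff map to indices 9-10, none cached. */
	printf("%d\n", range_has_page(cached, 3, 0x9000, 0xafff));	/* 0 */
	return 0;
}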
*/ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)  {  	wait_queue_head_t *q = page_waitqueue(page);  	unsigned long flags; @@ -2038,10 +2070,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)  		loff_t size;  		size = i_size_read(inode); -		retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, -					iocb->ki_pos + count - 1); -		if (retval < 0) -			goto out; +		if (iocb->ki_flags & IOCB_NOWAIT) { +			if (filemap_range_has_page(mapping, iocb->ki_pos, +						   iocb->ki_pos + count - 1)) +				return -EAGAIN; +		} else { +			retval = filemap_write_and_wait_range(mapping, +						iocb->ki_pos, +					        iocb->ki_pos + count - 1); +			if (retval < 0) +				goto out; +		}  		file_accessed(file); @@ -2226,7 +2265,7 @@ int filemap_fault(struct vm_fault *vmf)  		/* No page in the page cache at all */  		do_sync_mmap_readahead(vmf->vma, ra, file, offset);  		count_vm_event(PGMAJFAULT); -		mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); +		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);  		ret = VM_FAULT_MAJOR;  retry_find:  		page = find_get_page(mapping, offset); @@ -2642,6 +2681,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)  	pos = iocb->ki_pos; +	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) +		return -EINVAL; +  	if (limit != RLIM_INFINITY) {  		if (iocb->ki_pos >= limit) {  			send_sig(SIGXFSZ, current, 0); @@ -2710,9 +2752,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)  	write_len = iov_iter_count(from);  	end = (pos + write_len - 1) >> PAGE_SHIFT; -	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); -	if (written) -		goto out; +	if (iocb->ki_flags & IOCB_NOWAIT) { +		/* If there are pages to writeback, return */ +		if (filemap_range_has_page(inode->i_mapping, pos, +					   pos + iov_iter_count(from))) +			return -EAGAIN; +	} else { +		written = filemap_write_and_wait_range(mapping, pos, +							pos + write_len - 1); +		if (written) +			goto out; +	}  	/*  	 * After a write we want buffered reads to be sure to go to disk to get @@ -208,72 +208,28 @@ no_page:  	return no_page_table(vma, flags);  } -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in <linux/mm.h> - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
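The IOCB_NOWAIT changes above share one shape: probe the range and fail fast with -EAGAIN rather than block on writeback, and reject NOWAIT outright for buffered I/O. A hedged sketch of that decision (cache_has_page() and write_and_wait() are hypothetical stand-ins for filemap_range_has_page() and filemap_write_and_wait_range(), and the flag values are illustrative):

/*
 * Sketch of the IOCB_NOWAIT decision added to the direct-I/O paths: instead
 * of waiting for writeback, probe the byte range and return -EAGAIN if any
 * page-cache page is present.  Flag values and helper names are illustrative.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define IOCB_DIRECT	0x1	/* illustrative values, not the kernel's */
#define IOCB_NOWAIT	0x2

static bool cache_has_page(long long pos, long long count) { return count > 4096; }
static int write_and_wait(long long pos, long long count)  { return 0; }

static int prepare_direct_io(int ki_flags, long long pos, long long count)
{
	/* IOCB_NOWAIT is only honoured for direct I/O. */
	if ((ki_flags & IOCB_NOWAIT) && !(ki_flags & IOCB_DIRECT))
		return -EINVAL;

	if (ki_flags & IOCB_NOWAIT) {
		/* Would have to wait for writeback: tell the caller to retry. */
		if (cache_has_page(pos, count))
			return -EAGAIN;
		return 0;
	}
	/* Blocking path: flush and wait on the range as before. */
	return write_and_wait(pos, count);
}

int main(void)
{
	printf("%d\n", prepare_direct_io(IOCB_NOWAIT, 0, 8192));		/* -EINVAL */
	printf("%d\n", prepare_direct_io(IOCB_DIRECT | IOCB_NOWAIT, 0, 8192));	/* -EAGAIN */
	printf("%d\n", prepare_direct_io(IOCB_DIRECT, 0, 8192));		/* 0 */
	return 0;
}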
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, -			      unsigned long address, unsigned int flags, -			      unsigned int *page_mask) +static struct page *follow_pmd_mask(struct vm_area_struct *vma, +				    unsigned long address, pud_t *pudp, +				    unsigned int flags, unsigned int *page_mask)  { -	pgd_t *pgd; -	p4d_t *p4d; -	pud_t *pud;  	pmd_t *pmd;  	spinlock_t *ptl;  	struct page *page;  	struct mm_struct *mm = vma->vm_mm; -	*page_mask = 0; - -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE); -	if (!IS_ERR(page)) { -		BUG_ON(flags & FOLL_GET); -		return page; -	} - -	pgd = pgd_offset(mm, address); -	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) -		return no_page_table(vma, flags); -	p4d = p4d_offset(pgd, address); -	if (p4d_none(*p4d)) -		return no_page_table(vma, flags); -	BUILD_BUG_ON(p4d_huge(*p4d)); -	if (unlikely(p4d_bad(*p4d))) -		return no_page_table(vma, flags); -	pud = pud_offset(p4d, address); -	if (pud_none(*pud)) +	pmd = pmd_offset(pudp, address); +	if (pmd_none(*pmd))  		return no_page_table(vma, flags); -	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { -		page = follow_huge_pud(mm, address, pud, flags); +	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { +		page = follow_huge_pmd(mm, address, pmd, flags);  		if (page)  			return page;  		return no_page_table(vma, flags);  	} -	if (pud_devmap(*pud)) { -		ptl = pud_lock(mm, pud); -		page = follow_devmap_pud(vma, address, pud, flags); -		spin_unlock(ptl); -		if (page) -			return page; -	} -	if (unlikely(pud_bad(*pud))) -		return no_page_table(vma, flags); - -	pmd = pmd_offset(pud, address); -	if (pmd_none(*pmd)) -		return no_page_table(vma, flags); -	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { -		page = follow_huge_pmd(mm, address, pmd, flags); +	if (is_hugepd(__hugepd(pmd_val(*pmd)))) { +		page = follow_huge_pd(vma, address, +				      __hugepd(pmd_val(*pmd)), flags, +				      PMD_SHIFT);  		if (page)  			return page;  		return no_page_table(vma, flags); @@ -319,13 +275,131 @@ struct page *follow_page_mask(struct vm_area_struct *vma,  		return ret ? 
ERR_PTR(ret) :  			follow_page_pte(vma, address, pmd, flags);  	} -  	page = follow_trans_huge_pmd(vma, address, pmd, flags);  	spin_unlock(ptl);  	*page_mask = HPAGE_PMD_NR - 1;  	return page;  } + +static struct page *follow_pud_mask(struct vm_area_struct *vma, +				    unsigned long address, p4d_t *p4dp, +				    unsigned int flags, unsigned int *page_mask) +{ +	pud_t *pud; +	spinlock_t *ptl; +	struct page *page; +	struct mm_struct *mm = vma->vm_mm; + +	pud = pud_offset(p4dp, address); +	if (pud_none(*pud)) +		return no_page_table(vma, flags); +	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { +		page = follow_huge_pud(mm, address, pud, flags); +		if (page) +			return page; +		return no_page_table(vma, flags); +	} +	if (is_hugepd(__hugepd(pud_val(*pud)))) { +		page = follow_huge_pd(vma, address, +				      __hugepd(pud_val(*pud)), flags, +				      PUD_SHIFT); +		if (page) +			return page; +		return no_page_table(vma, flags); +	} +	if (pud_devmap(*pud)) { +		ptl = pud_lock(mm, pud); +		page = follow_devmap_pud(vma, address, pud, flags); +		spin_unlock(ptl); +		if (page) +			return page; +	} +	if (unlikely(pud_bad(*pud))) +		return no_page_table(vma, flags); + +	return follow_pmd_mask(vma, address, pud, flags, page_mask); +} + + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, +				    unsigned long address, pgd_t *pgdp, +				    unsigned int flags, unsigned int *page_mask) +{ +	p4d_t *p4d; +	struct page *page; + +	p4d = p4d_offset(pgdp, address); +	if (p4d_none(*p4d)) +		return no_page_table(vma, flags); +	BUILD_BUG_ON(p4d_huge(*p4d)); +	if (unlikely(p4d_bad(*p4d))) +		return no_page_table(vma, flags); + +	if (is_hugepd(__hugepd(p4d_val(*p4d)))) { +		page = follow_huge_pd(vma, address, +				      __hugepd(p4d_val(*p4d)), flags, +				      P4D_SHIFT); +		if (page) +			return page; +		return no_page_table(vma, flags); +	} +	return follow_pud_mask(vma, address, p4d, flags, page_mask); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). 
+ */ +struct page *follow_page_mask(struct vm_area_struct *vma, +			      unsigned long address, unsigned int flags, +			      unsigned int *page_mask) +{ +	pgd_t *pgd; +	struct page *page; +	struct mm_struct *mm = vma->vm_mm; + +	*page_mask = 0; + +	/* make this handle hugepd */ +	page = follow_huge_addr(mm, address, flags & FOLL_WRITE); +	if (!IS_ERR(page)) { +		BUG_ON(flags & FOLL_GET); +		return page; +	} + +	pgd = pgd_offset(mm, address); + +	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) +		return no_page_table(vma, flags); + +	if (pgd_huge(*pgd)) { +		page = follow_huge_pgd(mm, address, pgd, flags); +		if (page) +			return page; +		return no_page_table(vma, flags); +	} +	if (is_hugepd(__hugepd(pgd_val(*pgd)))) { +		page = follow_huge_pd(vma, address, +				      __hugepd(pgd_val(*pgd)), flags, +				      PGDIR_SHIFT); +		if (page) +			return page; +		return no_page_table(vma, flags); +	} + +	return follow_p4d_mask(vma, address, pgd, flags, page_mask); +} +  static int get_gate_page(struct mm_struct *mm, unsigned long address,  		unsigned int gup_flags, struct vm_area_struct **vma,  		struct page **page) @@ -387,11 +461,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,  	/* mlock all present pages, but do not fault in new pages */  	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)  		return -ENOENT; -	/* For mm_populate(), just skip the stack guard page. */ -	if ((*flags & FOLL_POPULATE) && -			(stack_guard_page_start(vma, address) || -			 stack_guard_page_end(vma, address + PAGE_SIZE))) -		return -ENOENT;  	if (*flags & FOLL_WRITE)  		fault_flags |= FAULT_FLAG_WRITE;  	if (*flags & FOLL_REMOTE) @@ -1151,7 +1220,7 @@ struct page *get_dump_page(unsigned long addr)  #endif /* CONFIG_ELF_CORE */  /* - * Generic RCU Fast GUP + * Generic Fast GUP   *   * get_user_pages_fast attempts to pin user pages by walking the page   * tables directly and avoids taking locks. Thus the walker needs to be @@ -1172,8 +1241,8 @@ struct page *get_dump_page(unsigned long addr)   * Before activating this code, please be aware that the following assumptions   * are currently made:   * - *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - *      pages containing page tables. + *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + *  free pages containing page tables or TLB flushing requires IPI broadcast.   *   *  *) ptes can be read atomically by the architecture.   * @@ -1183,7 +1252,7 @@ struct page *get_dump_page(unsigned long addr)   *   * This code is based heavily on the PowerPC implementation by Nick Piggin.   
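After the refactor, follow_page_mask() only handles the pgd level itself and delegates each lower level to its own helper, with every helper responsible for the huge mappings that can appear at its level. A toy model of that per-level delegation, using three small arrays in place of pgd/pud/pmd:

/*
 * Toy model of the follow_page_mask() restructuring: a top-level walker that
 * checks its own level for a "huge" mapping and otherwise delegates to the
 * helper for the next level down.  Three levels of plain arrays stand in for
 * pgd/pud/pmd; entry value 0 means "none" and odd values mean "huge here".
 */
#include <stdio.h>

#define LEVEL_BITS 2
#define ENTRIES    (1 << LEVEL_BITS)

struct table { unsigned long e[ENTRIES]; };

static long follow_pmd(const struct table *pmd, unsigned long addr)
{
	unsigned long v = pmd->e[addr & (ENTRIES - 1)];

	if (!v)
		return -1;			/* no_page_table() */
	return (long)v;				/* leaf: the "page" */
}

static long follow_pud(const struct table *pud, const struct table *pmd,
		       unsigned long addr)
{
	unsigned long v = pud->e[(addr >> LEVEL_BITS) & (ENTRIES - 1)];

	if (!v)
		return -1;
	if (v & 1)
		return (long)v;			/* huge entry handled at this level */
	return follow_pmd(pmd, addr);		/* delegate one level down */
}

static long follow_page(const struct table *pgd, const struct table *pud,
			const struct table *pmd, unsigned long addr)
{
	unsigned long v = pgd->e[(addr >> (2 * LEVEL_BITS)) & (ENTRIES - 1)];

	if (!v)
		return -1;
	if (v & 1)
		return (long)v;
	return follow_pud(pud, pmd, addr);
}

int main(void)
{
	struct table pgd = { { 2 } }, pud = { { 2 } }, pmd = { { 0, 42 } };

	printf("%ld\n", follow_page(&pgd, &pud, &pmd, 1));	/* 42 */
	printf("%ld\n", follow_page(&pgd, &pud, &pmd, 0));	/* -1 */
	return 0;
}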
*/ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP  #ifndef gup_get_pte  /* @@ -1354,16 +1423,15 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,  		return __gup_device_huge_pmd(orig, addr, end, pages, nr);  	refs = 0; -	head = pmd_page(orig); -	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); +	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page;  		(*nr)++;  		page++;  		refs++;  	} while (addr += PAGE_SIZE, addr != end); +	head = compound_head(pmd_page(orig));  	if (!page_cache_add_speculative(head, refs)) {  		*nr -= refs;  		return 0; @@ -1393,16 +1461,15 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,  		return __gup_device_huge_pud(orig, addr, end, pages, nr);  	refs = 0; -	head = pud_page(orig); -	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); +	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page;  		(*nr)++;  		page++;  		refs++;  	} while (addr += PAGE_SIZE, addr != end); +	head = compound_head(pud_page(orig));  	if (!page_cache_add_speculative(head, refs)) {  		*nr -= refs;  		return 0; @@ -1431,16 +1498,15 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,  	BUILD_BUG_ON(pgd_devmap(orig));  	refs = 0; -	head = pgd_page(orig); -	page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); +	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page;  		(*nr)++;  		page++;  		refs++;  	} while (addr += PAGE_SIZE, addr != end); +	head = compound_head(pgd_page(orig));  	if (!page_cache_add_speculative(head, refs)) {  		*nr -= refs;  		return 0; @@ -1673,4 +1739,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,  	return ret;  } -#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a84909cf20d3..86975dec0ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1426,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)  	 */  	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {  		page = pmd_page(*vmf->pmd); +		if (!get_page_unless_zero(page)) +			goto out_unlock;  		spin_unlock(vmf->ptl);  		wait_on_page_locked(page); +		put_page(page);  		goto out;  	} @@ -1459,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)  	/* Migration could have started since the pmd_trans_migrating check */  	if (!page_locked) { +		page_nid = -1; +		if (!get_page_unless_zero(page)) +			goto out_unlock;  		spin_unlock(vmf->ptl);  		wait_on_page_locked(page); -		page_nid = -1; +		put_page(page);  		goto out;  	} @@ -1569,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,  		get_page(page);  		spin_unlock(ptl);  		split_huge_page(page); -		put_page(page);  		unlock_page(page); +		put_page(page);  		goto out_unlocked;  	} @@ -2197,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail,  	 * atomic_set() here would be safe on all archs (and not only on x86),  	 * it's safer to use atomic_inc()/atomic_add().  	 
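The gup_huge_{pmd,pud,pgd} rework records the pages first and then takes all of the references on the compound head in a single speculative bump, undoing the bookkeeping if the head has already gone away. A simplified model of that collect/batch-reference/roll-back shape, using C11 atomics rather than the kernel's page_cache_add_speculative():

/*
 * Simplified model of the reworked gup_huge_pmd() flow: record the pages in
 * the caller's array first, then take all 'refs' references on the compound
 * head with one speculative bump, rolling back the array count if the head
 * has already been freed.  C11 atomics stand in for the kernel helpers.
 */
#include <stdatomic.h>
#include <stdio.h>

struct page { atomic_int refcount; };

/* Speculative get: only succeeds while the head still has a reference. */
static int page_ref_add_unless_zero(struct page *head, int refs)
{
	int old = atomic_load(&head->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&head->refcount, &old, old + refs))
			return 1;
	}
	return 0;
}

static int gup_huge(struct page *head, struct page **pages, int npages, int *nr)
{
	int refs = 0, i;

	for (i = 0; i < npages; i++) {		/* record pages, count refs */
		pages[(*nr)++] = head;		/* the kernel stores head + i here */
		refs++;
	}

	if (!page_ref_add_unless_zero(head, refs)) {
		*nr -= refs;			/* head went away: undo */
		return 0;
	}
	return 1;
}

int main(void)
{
	struct page head = { 1 };
	struct page *pages[8];
	int nr = 0;

	printf("%d nr=%d ref=%d\n", gup_huge(&head, pages, 4, &nr),
	       nr, atomic_load(&head.refcount));	/* 1 nr=4 ref=5 */

	atomic_store(&head.refcount, 0);		/* simulate a freed head */
	printf("%d nr=%d\n", gup_huge(&head, pages, 4, &nr));	/* 0 nr=4 */
	return 0;
}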
*/ -	if (PageAnon(head)) { +	if (PageAnon(head) && !PageSwapCache(head)) {  		page_ref_inc(page_tail);  	} else {  		/* Additional pin to radix tree */ @@ -2208,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail,  	page_tail->flags |= (head->flags &  			((1L << PG_referenced) |  			 (1L << PG_swapbacked) | +			 (1L << PG_swapcache) |  			 (1L << PG_mlocked) |  			 (1L << PG_uptodate) |  			 (1L << PG_active) | @@ -2270,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,  	ClearPageCompound(head);  	/* See comment in __split_huge_page_tail() */  	if (PageAnon(head)) { -		page_ref_inc(head); +		/* Additional pin to radix tree of swap cache */ +		if (PageSwapCache(head)) +			page_ref_add(head, 2); +		else +			page_ref_inc(head);  	} else {  		/* Additional pin to radix tree */  		page_ref_add(head, 2); @@ -2379,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)  	return ret;  } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ +	int extra_pins; + +	/* Additional pins from radix tree */ +	if (PageAnon(page)) +		extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; +	else +		extra_pins = HPAGE_PMD_NR; +	if (pextra_pins) +		*pextra_pins = extra_pins; +	return total_mapcount(page) == page_count(page) - extra_pins - 1; +} +  /*   * This function splits huge page into normal pages. @page can point to any   * subpage of huge page to split. Split doesn't change the position of @page. @@ -2426,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)  			ret = -EBUSY;  			goto out;  		} -		extra_pins = 0;  		mapping = NULL;  		anon_vma_lock_write(anon_vma);  	} else { @@ -2438,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)  			goto out;  		} -		/* Addidional pins from radix tree */ -		extra_pins = HPAGE_PMD_NR;  		anon_vma = NULL;  		i_mmap_lock_read(mapping);  	} @@ -2448,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)  	 * Racy check if we can split the page, before freeze_page() will  	 * split PMDs  	 */ -	if (total_mapcount(head) != page_count(head) - extra_pins - 1) { +	if (!can_split_huge_page(head, &extra_pins)) {  		ret = -EBUSY;  		goto out_unlock;  	} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3eedb187e549..1a88006ec634 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -867,7 +867,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)  	h->free_huge_pages_node[nid]++;  } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)  {  	struct page *page; @@ -887,6 +887,22 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)  	return page;  } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ +	struct page *page; +	int node; + +	if (nid != NUMA_NO_NODE) +		return dequeue_huge_page_node_exact(h, nid); + +	for_each_online_node(node) { +		page = dequeue_huge_page_node_exact(h, node); +		if (page) +			return page; +	} +	return NULL; +} +  /* Movability of hugepages depends on migration support. 
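can_split_huge_page() now derives the expected extra pins from the page type: none for a plain anonymous THP, HPAGE_PMD_NR radix-tree pins once the page sits in the swap cache, and HPAGE_PMD_NR for page-cache pages. A small model of that arithmetic, assuming 512 subpages per PMD-sized THP:

/*
 * Model of the can_split_huge_page() pin accounting, assuming a PMD-sized
 * THP of 512 subpages.  The split is allowed only when every reference is
 * explained by the mappings plus the expected radix-tree pins plus the one
 * reference held by the caller.
 */
#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512

static bool can_split(bool anon, bool swapcache, int mapcount, int refcount)
{
	int extra_pins;

	if (anon)
		extra_pins = swapcache ? HPAGE_PMD_NR : 0;
	else
		extra_pins = HPAGE_PMD_NR;	/* page-cache pins, one per subpage */

	return mapcount == refcount - extra_pins - 1;
}

int main(void)
{
	/* Anon THP, not in swap cache: 1 mapping + caller's reference. */
	printf("%d\n", can_split(true, false, 1, 2));			/* 1 */
	/* Same page with a stray extra reference: refuse to split. */
	printf("%d\n", can_split(true, false, 1, 3));			/* 0 */
	/* Anon THP that has been added to the swap cache. */
	printf("%d\n", can_split(true, true, 1, 1 + HPAGE_PMD_NR + 1));	/* 1 */
	return 0;
}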
*/  static inline gfp_t htlb_alloc_mask(struct hstate *h)  { @@ -904,6 +920,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,  	struct page *page = NULL;  	struct mempolicy *mpol;  	nodemask_t *nodemask; +	gfp_t gfp_mask; +	int nid;  	struct zonelist *zonelist;  	struct zone *zone;  	struct zoneref *z; @@ -924,12 +942,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,  retry_cpuset:  	cpuset_mems_cookie = read_mems_allowed_begin(); -	zonelist = huge_zonelist(vma, address, -					htlb_alloc_mask(h), &mpol, &nodemask); +	gfp_mask = htlb_alloc_mask(h); +	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); +	zonelist = node_zonelist(nid, gfp_mask);  	for_each_zone_zonelist_nodemask(zone, z, zonelist,  						MAX_NR_ZONES - 1, nodemask) { -		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { +		if (cpuset_zone_allowed(zone, gfp_mask)) {  			page = dequeue_huge_page_node(h, zone_to_nid(zone));  			if (page) {  				if (avoid_reserve) @@ -1024,9 +1043,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)  		((node = hstate_next_node_to_free(hs, mask)) || 1);	\  		nr_nodes--) -#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ -	((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ -	defined(CONFIG_CMA)) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE  static void destroy_compound_gigantic_page(struct page *page,  					unsigned int order)  { @@ -1158,8 +1175,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h,  	return 0;  } -static inline bool gigantic_page_supported(void) { return true; } -#else +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */  static inline bool gigantic_page_supported(void) { return false; }  static inline void free_gigantic_page(struct page *page, unsigned int order) { }  static inline void destroy_compound_gigantic_page(struct page *page, @@ -1545,13 +1561,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,  	do {  		struct page *page;  		struct mempolicy *mpol; -		struct zonelist *zl; +		int nid;  		nodemask_t *nodemask;  		cpuset_mems_cookie = read_mems_allowed_begin(); -		zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); +		nid = huge_node(vma, addr, gfp, &mpol, &nodemask);  		mpol_cond_put(mpol); -		page = __alloc_pages_nodemask(gfp, order, zl, nodemask); +		page = __alloc_pages_nodemask(gfp, order, nid, nodemask);  		if (page)  			return page;  	} while (read_mems_allowed_retry(cpuset_mems_cookie)); @@ -3185,17 +3201,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,  		update_mmu_cache(vma, address, ptep);  } -static int is_hugetlb_entry_migration(pte_t pte) +bool is_hugetlb_entry_migration(pte_t pte)  {  	swp_entry_t swp;  	if (huge_pte_none(pte) || pte_present(pte)) -		return 0; +		return false;  	swp = pte_to_swp_entry(pte);  	if (non_swap_entry(swp) && is_migration_entry(swp)) -		return 1; +		return true;  	else -		return 0; +		return false;  }  static int is_hugetlb_entry_hwpoisoned(pte_t pte) @@ -3233,7 +3249,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,  	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {  		spinlock_t *src_ptl, *dst_ptl; -		src_pte = huge_pte_offset(src, addr); +		src_pte = huge_pte_offset(src, addr, sz);  		if (!src_pte)  			continue;  		dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3263,9 +3279,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,  				 */  				make_migration_entry_read(&swp_entry);  				entry = swp_entry_to_pte(swp_entry); -				
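dequeue_huge_page_node() now tolerates NUMA_NO_NODE by iterating every online node through the new _exact helper. A userspace model of the split, with per-node free counters standing in for the hstate free lists (node count and values are made up):

/*
 * Model of the dequeue_huge_page_node() split: an exact-node helper plus a
 * wrapper that treats NUMA_NO_NODE as "any online node".  Per-node free-page
 * counters stand in for the hstate free lists.
 */
#include <stdio.h>

#define NR_NODES     4
#define NUMA_NO_NODE (-1)

static int free_huge_pages_node[NR_NODES] = { 0, 0, 3, 1 };

static int dequeue_node_exact(int nid)
{
	if (free_huge_pages_node[nid] == 0)
		return -1;			/* nothing free on this node */
	free_huge_pages_node[nid]--;
	return nid;				/* "page" identified by its node */
}

static int dequeue_node(int nid)
{
	int node;

	if (nid != NUMA_NO_NODE)
		return dequeue_node_exact(nid);

	for (node = 0; node < NR_NODES; node++) {
		int got = dequeue_node_exact(node);

		if (got >= 0)
			return got;
	}
	return -1;
}

int main(void)
{
	printf("%d\n", dequeue_node(0));		/* -1: node 0 is empty        */
	printf("%d\n", dequeue_node(NUMA_NO_NODE));	/*  2: first node with a page */
	return 0;
}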
set_huge_pte_at(src, addr, src_pte, entry); +				set_huge_swap_pte_at(src, addr, src_pte, +						     entry, sz);  			} -			set_huge_pte_at(dst, addr, dst_pte, entry); +			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);  		} else {  			if (cow) {  				huge_ptep_set_wrprotect(src, addr, src_pte); @@ -3317,7 +3334,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,  	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);  	address = start;  	for (; address < end; address += sz) { -		ptep = huge_pte_offset(mm, address); +		ptep = huge_pte_offset(mm, address, sz);  		if (!ptep)  			continue; @@ -3338,7 +3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,  		 * unmapped and its refcount is dropped, so just clear pte here.  		 */  		if (unlikely(!pte_present(pte))) { -			huge_pte_clear(mm, address, ptep); +			huge_pte_clear(mm, address, ptep, sz);  			spin_unlock(ptl);  			continue;  		} @@ -3535,7 +3552,8 @@ retry_avoidcopy:  			unmap_ref_private(mm, vma, old_page, address);  			BUG_ON(huge_pte_none(pte));  			spin_lock(ptl); -			ptep = huge_pte_offset(mm, address & huge_page_mask(h)); +			ptep = huge_pte_offset(mm, address & huge_page_mask(h), +					       huge_page_size(h));  			if (likely(ptep &&  				   pte_same(huge_ptep_get(ptep), pte)))  				goto retry_avoidcopy; @@ -3574,7 +3592,8 @@ retry_avoidcopy:  	 * before the page tables are altered  	 */  	spin_lock(ptl); -	ptep = huge_pte_offset(mm, address & huge_page_mask(h)); +	ptep = huge_pte_offset(mm, address & huge_page_mask(h), +			       huge_page_size(h));  	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {  		ClearPagePrivate(new_page); @@ -3861,7 +3880,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	address &= huge_page_mask(h); -	ptep = huge_pte_offset(mm, address); +	ptep = huge_pte_offset(mm, address, huge_page_size(h));  	if (ptep) {  		entry = huge_ptep_get(ptep);  		if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4118,7 +4137,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,  		 *  		 * Note that page table lock is not held when pte is null.  		 
*/ -		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); +		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), +				      huge_page_size(h));  		if (pte)  			ptl = huge_pte_lock(h, mm, pte);  		absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4257,7 +4277,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,  	i_mmap_lock_write(vma->vm_file->f_mapping);  	for (; address < end; address += huge_page_size(h)) {  		spinlock_t *ptl; -		ptep = huge_pte_offset(mm, address); +		ptep = huge_pte_offset(mm, address, huge_page_size(h));  		if (!ptep)  			continue;  		ptl = huge_pte_lock(h, mm, ptep); @@ -4279,7 +4299,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,  				make_migration_entry_read(&entry);  				newpte = swp_entry_to_pte(entry); -				set_huge_pte_at(mm, address, ptep, newpte); +				set_huge_swap_pte_at(mm, address, ptep, +						     newpte, huge_page_size(h));  				pages++;  			}  			spin_unlock(ptl); @@ -4521,7 +4542,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)  		saddr = page_table_shareable(svma, vma, addr, idx);  		if (saddr) { -			spte = huge_pte_offset(svma->vm_mm, saddr); +			spte = huge_pte_offset(svma->vm_mm, saddr, +					       vma_mmu_pagesize(svma));  			if (spte) {  				get_page(virt_to_page(spte));  				break; @@ -4617,7 +4639,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,  	return pte;  } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, +		       unsigned long addr, unsigned long sz)  {  	pgd_t *pgd;  	p4d_t *p4d; @@ -4653,6 +4676,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,  }  struct page * __weak +follow_huge_pd(struct vm_area_struct *vma, +	       unsigned long address, hugepd_t hpd, int flags, int pdshift) +{ +	WARN(1, "hugepd follow called with no support for hugepage directory format\n"); +	return NULL; +} + +struct page * __weak  follow_huge_pmd(struct mm_struct *mm, unsigned long address,  		pmd_t *pmd, int flags)  { @@ -4699,6 +4730,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,  	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);  } +struct page * __weak +follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) +{ +	if (flags & FOLL_GET) +		return NULL; + +	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); +} +  #ifdef CONFIG_MEMORY_FAILURE  /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 945fd1ca49b5..df4ebdb2b10a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,  			spin_unlock(ptl);  			free_page_and_swap_cache(src_page);  		} -		cond_resched();  	}  } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 20036d4f9f13..7780cd83a495 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -150,7 +150,7 @@ struct kmemleak_scan_area {   */  struct kmemleak_object {  	spinlock_t lock; -	unsigned long flags;		/* object status flags */ +	unsigned int flags;		/* object status flags */  	struct list_head object_list;  	struct list_head gray_list;  	struct rb_node rb_node; @@ -159,6 +159,8 @@ struct kmemleak_object {  	atomic_t use_count;  	unsigned long pointer;  	size_t size; +	/* pass surplus references to this pointer */ +	unsigned long excess_ref;  	/* minimum number of a pointers found before it is considered leak */  	int min_count;  	/* the total number of pointers found pointing to this object */ @@ -253,7 
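follow_huge_pd() and follow_huge_pgd() are added as __weak placeholders so only architectures with hugepage directories or pgd-level huge pages have to supply real implementations. A standalone GCC/Clang sketch of the weak-symbol pattern (signature trimmed for brevity; a real override would live in the architecture's own file):

/*
 * Minimal demonstration of the __weak default pattern used for
 * follow_huge_pd()/follow_huge_pgd(): the generic file provides a weak stub
 * that warns, and an architecture may supply a strong definition in another
 * translation unit, which the linker then picks instead.
 */
#include <stdio.h>

#define __weak __attribute__((weak))

struct page;

struct page * __weak follow_huge_pd(unsigned long address, int flags)
{
	/* Generic builds without hugepage directories end up here. */
	fprintf(stderr, "hugepd follow called with no support (addr=%#lx, flags=%d)\n",
		address, flags);
	return NULL;
}

int main(void)
{
	struct page *p = follow_huge_pd(0x1000, 0);

	printf("got %s\n", p ? "a page" : "NULL from the weak stub");
	return 0;
}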
+255,8 @@ enum {  	KMEMLEAK_NOT_LEAK,  	KMEMLEAK_IGNORE,  	KMEMLEAK_SCAN_AREA, -	KMEMLEAK_NO_SCAN +	KMEMLEAK_NO_SCAN, +	KMEMLEAK_SET_EXCESS_REF  };  /* @@ -262,9 +265,12 @@ enum {   */  struct early_log {  	int op_type;			/* kmemleak operation type */ -	const void *ptr;		/* allocated/freed memory block */ -	size_t size;			/* memory block size */  	int min_count;			/* minimum reference count */ +	const void *ptr;		/* allocated/freed memory block */ +	union { +		size_t size;		/* memory block size */ +		unsigned long excess_ref; /* surplus reference passing */ +	};  	unsigned long trace[MAX_TRACE];	/* stack trace */  	unsigned int trace_len;		/* stack trace length */  }; @@ -393,7 +399,7 @@ static void dump_object_info(struct kmemleak_object *object)  		  object->comm, object->pid, object->jiffies);  	pr_notice("  min_count = %d\n", object->min_count);  	pr_notice("  count = %d\n", object->count); -	pr_notice("  flags = 0x%lx\n", object->flags); +	pr_notice("  flags = 0x%x\n", object->flags);  	pr_notice("  checksum = %u\n", object->checksum);  	pr_notice("  backtrace:\n");  	print_stack_trace(&trace, 4); @@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,  	object->flags = OBJECT_ALLOCATED;  	object->pointer = ptr;  	object->size = size; +	object->excess_ref = 0;  	object->min_count = min_count;  	object->count = 0;			/* white color initially */  	object->jiffies = jiffies; @@ -795,6 +802,30 @@ out:  }  /* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). + */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ +	unsigned long flags; +	struct kmemleak_object *object; + +	object = find_and_get_object(ptr, 0); +	if (!object) { +		kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", +			      ptr); +		return; +	} + +	spin_lock_irqsave(&object->lock, flags); +	object->excess_ref = excess_ref; +	spin_unlock_irqrestore(&object->lock, flags); +	put_object(object); +} + +/*   * Set the OBJECT_NO_SCAN flag for the object corresponding to the give   * pointer. Such object will not be scanned by kmemleak but references to it   * are searched. @@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log)   * @gfp:	kmalloc() flags used for kmemleak internal memory allocations   *   * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.).   */  void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,  			  gfp_t gfp) @@ -952,6 +983,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,  EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);  /** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area:	pointer to vm_struct + * @size:	size of the object + * @gfp:	__vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ +	pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + +	/* +	 * A min_count = 2 is needed because vm_struct contains a reference to +	 * the virtual address of the vmalloc'ed block. 
+	 */ +	if (kmemleak_enabled) { +		create_object((unsigned long)area->addr, size, 2, gfp); +		object_set_excess_ref((unsigned long)area, +				      (unsigned long)area->addr); +	} else if (kmemleak_early_log) { +		log_early(KMEMLEAK_ALLOC, area->addr, size, 2); +		/* reusing early_log.size for storing area->addr */ +		log_early(KMEMLEAK_SET_EXCESS_REF, +			  area, (unsigned long)area->addr, 0); +	} +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + +/**   * kmemleak_free - unregister a previously registered object   * @ptr:	pointer to beginning of the object   * @@ -1188,6 +1249,30 @@ static bool update_checksum(struct kmemleak_object *object)  }  /* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ +	if (!color_white(object)) { +		/* non-orphan, ignored or new */ +		return; +	} + +	/* +	 * Increase the object's reference count (number of pointers to the +	 * memory block). If this count reaches the required minimum, the +	 * object's color will become gray and it will be added to the +	 * gray_list. +	 */ +	object->count++; +	if (color_gray(object)) { +		/* put_object() called when removing from gray_list */ +		WARN_ON(!get_object(object)); +		list_add_tail(&object->gray_list, &gray_list); +	} +} + +/*   * Memory scanning is a long process and it needs to be interruptable. This   * function checks whether such interrupt condition occurred.   */ @@ -1224,6 +1309,7 @@ static void scan_block(void *_start, void *_end,  	for (ptr = start; ptr < end; ptr++) {  		struct kmemleak_object *object;  		unsigned long pointer; +		unsigned long excess_ref;  		if (scan_should_stop())  			break; @@ -1259,25 +1345,27 @@ static void scan_block(void *_start, void *_end,  		 * enclosed by scan_mutex.  		 */  		spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); -		if (!color_white(object)) { -			/* non-orphan, ignored or new */ -			spin_unlock(&object->lock); -			continue; -		} - -		/* -		 * Increase the object's reference count (number of pointers -		 * to the memory block). If this count reaches the required -		 * minimum, the object's color will become gray and it will be -		 * added to the gray_list. 
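kmemleak_vmalloc() registers the block with min_count = 2 because the vm_struct's addr field is itself one expected pointer, and any surplus references found on an already-gray object are forwarded through excess_ref. A compact model of just the counting rule that the new update_refs() applies (an illustration, not the scanner):

/*
 * Compact model of the kmemleak reference counting touched here: each
 * registered object is "white" until scanning has found at least min_count
 * pointers to it, at which point it turns gray and is no longer a leak
 * candidate.  vmalloc'ed blocks use min_count = 2 because vm_struct->addr is
 * itself one expected reference.  This is an illustration, not the scanner.
 */
#include <stdbool.h>
#include <stdio.h>

struct object {
	const char *name;
	int min_count;		/* references needed before it is not a leak */
	int count;		/* references found by the current scan */
};

static bool color_white(const struct object *o) { return o->count < o->min_count; }

static void update_refs(struct object *o)
{
	if (!color_white(o))
		return;			/* already gray: a surplus reference */
	o->count++;
	if (!color_white(o))
		printf("%s turns gray\n", o->name);
}

int main(void)
{
	struct object kmalloc_obj = { "kmalloc block", 1, 0 };
	struct object vmalloc_obj = { "vmalloc block", 2, 0 };

	update_refs(&kmalloc_obj);	/* one pointer is enough */
	update_refs(&vmalloc_obj);	/* vm_struct->addr alone is not */
	update_refs(&vmalloc_obj);	/* a second pointer grays it */

	printf("vmalloc leak? %s\n", color_white(&vmalloc_obj) ? "yes" : "no");
	return 0;
}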
-		 */ -		object->count++; +		/* only pass surplus references (object already gray) */  		if (color_gray(object)) { -			/* put_object() called when removing from gray_list */ -			WARN_ON(!get_object(object)); -			list_add_tail(&object->gray_list, &gray_list); +			excess_ref = object->excess_ref; +			/* no need for update_refs() if object already gray */ +		} else { +			excess_ref = 0; +			update_refs(object);  		}  		spin_unlock(&object->lock); + +		if (excess_ref) { +			object = lookup_object(excess_ref, 0); +			if (!object) +				continue; +			if (object == scanned) +				/* circular reference, ignore */ +				continue; +			spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); +			update_refs(object); +			spin_unlock(&object->lock); +		}  	}  	read_unlock_irqrestore(&kmemleak_lock, flags);  } @@ -1980,6 +2068,10 @@ void __init kmemleak_init(void)  		case KMEMLEAK_NO_SCAN:  			kmemleak_no_scan(log->ptr);  			break; +		case KMEMLEAK_SET_EXCESS_REF: +			object_set_excess_ref((unsigned long)log->ptr, +					      log->excess_ref); +			break;  		default:  			kmemleak_warn("Unknown early log operation: %d\n",  				      log->op_type); @@ -128,9 +128,12 @@ struct ksm_scan {   * struct stable_node - node of the stable rbtree   * @node: rb node of this ksm page in the stable tree   * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain   * @list: linked into migrate_nodes, pending placement in the proper node tree   * @hlist: hlist head of rmap_items using this ksm page   * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN   * @nid: NUMA node id of stable tree in which linked (may not match kpfn)   */  struct stable_node { @@ -138,11 +141,24 @@ struct stable_node {  		struct rb_node node;	/* when node of stable tree */  		struct {		/* when listed for migration */  			struct list_head *head; -			struct list_head list; +			struct { +				struct hlist_node hlist_dup; +				struct list_head list; +			};  		};  	};  	struct hlist_head hlist; -	unsigned long kpfn; +	union { +		unsigned long kpfn; +		unsigned long chain_prune_time; +	}; +	/* +	 * STABLE_NODE_CHAIN can be any negative number in +	 * rmap_hlist_len negative range, but better not -1 to be able +	 * to reliably detect underflows. 
+	 */ +#define STABLE_NODE_CHAIN -1024 +	int rmap_hlist_len;  #ifdef CONFIG_NUMA  	int nid;  #endif @@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree;  /* Recently migrated nodes of stable tree, pending proper placement */  static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)  #define MM_SLOTS_HASH_BITS 10  static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared;  /* The number of rmap_items in use: to calculate pages_volatile */  static unsigned long ksm_rmap_items; +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; +  /* Number of pages ksmd should scan in one batch */  static unsigned int ksm_thread_pages_to_scan = 100; @@ -287,6 +316,45 @@ static void __init ksm_slab_free(void)  	mm_slot_cache = NULL;  } +static __always_inline bool is_stable_node_chain(struct stable_node *chain) +{ +	return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct stable_node *dup) +{ +	return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct stable_node *dup, +					     struct stable_node *chain) +{ +	VM_BUG_ON(is_stable_node_dup(dup)); +	dup->head = STABLE_NODE_DUP_HEAD; +	VM_BUG_ON(!is_stable_node_chain(chain)); +	hlist_add_head(&dup->hlist_dup, &chain->hlist); +	ksm_stable_node_dups++; +} + +static inline void __stable_node_dup_del(struct stable_node *dup) +{ +	VM_BUG_ON(!is_stable_node_dup(dup)); +	hlist_del(&dup->hlist_dup); +	ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct stable_node *dup) +{ +	VM_BUG_ON(is_stable_node_chain(dup)); +	if (is_stable_node_dup(dup)) +		__stable_node_dup_del(dup); +	else +		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM +	dup->head = NULL; +#endif +} +  static inline struct rmap_item *alloc_rmap_item(void)  {  	struct rmap_item *rmap_item; @@ -317,6 +385,8 @@ static inline struct stable_node *alloc_stable_node(void)  static inline void free_stable_node(struct stable_node *stable_node)  { +	VM_BUG_ON(stable_node->rmap_hlist_len && +		  !is_stable_node_chain(stable_node));  	kmem_cache_free(stable_node_cache, stable_node);  } @@ -498,25 +568,82 @@ static inline int get_kpfn_nid(unsigned long kpfn)  	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));  } +static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, +						   struct rb_root *root) +{ +	struct stable_node *chain = alloc_stable_node(); +	VM_BUG_ON(is_stable_node_chain(dup)); +	if (likely(chain)) { +		INIT_HLIST_HEAD(&chain->hlist); +		chain->chain_prune_time = jiffies; +		chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) +		chain->nid = -1; /* debug */ +#endif +		ksm_stable_node_chains++; + +		/* +		 * Put the stable node chain in the first dimension of +		 * the stable tree and at the same time remove the old +		 * stable node. 
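A stable_node now doubles as a chain head when rmap_hlist_len carries the negative STABLE_NODE_CHAIN sentinel, and a dup can take another sharer only while rmap_hlist_len stays below ksm_max_page_sharing. A self-contained model of those two predicates:

/*
 * Model of the new stable_node bookkeeping: rmap_hlist_len counts the
 * rmap_items hanging off a dup, or holds the negative STABLE_NODE_CHAIN
 * sentinel when the node is a chain head.  A dup is a sharing candidate only
 * while it has at least one mapping and room below the sharing limit.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define STABLE_NODE_CHAIN	(-1024)
static int ksm_max_page_sharing = 256;

struct stable_node { int rmap_hlist_len; };

static bool is_stable_node_chain(const struct stable_node *n)
{
	return n->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static bool is_page_sharing_candidate(const struct stable_node *n, int offset)
{
	assert(n->rmap_hlist_len >= 0);		/* never called on a chain head */
	return n->rmap_hlist_len &&
	       n->rmap_hlist_len + offset < ksm_max_page_sharing;
}

int main(void)
{
	struct stable_node chain = { STABLE_NODE_CHAIN };
	struct stable_node dup   = { 255 };

	printf("chain? %d\n", is_stable_node_chain(&chain));		/* 1 */
	printf("room now? %d\n", is_page_sharing_candidate(&dup, 0));	/* 1 */
	printf("room for one more merge? %d\n",
	       is_page_sharing_candidate(&dup, 1));			/* 0 */
	return 0;
}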
+		 */ +		rb_replace_node(&dup->node, &chain->node, root); + +		/* +		 * Move the old stable node to the second dimension +		 * queued in the hlist_dup. The invariant is that all +		 * dup stable_nodes in the chain->hlist point to pages +		 * that are wrprotected and have the exact same +		 * content. +		 */ +		stable_node_chain_add_dup(dup, chain); +	} +	return chain; +} + +static inline void free_stable_node_chain(struct stable_node *chain, +					  struct rb_root *root) +{ +	rb_erase(&chain->node, root); +	free_stable_node(chain); +	ksm_stable_node_chains--; +} +  static void remove_node_from_stable_tree(struct stable_node *stable_node)  {  	struct rmap_item *rmap_item; +	/* check it's not STABLE_NODE_CHAIN or negative */ +	BUG_ON(stable_node->rmap_hlist_len < 0); +  	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {  		if (rmap_item->hlist.next)  			ksm_pages_sharing--;  		else  			ksm_pages_shared--; +		VM_BUG_ON(stable_node->rmap_hlist_len <= 0); +		stable_node->rmap_hlist_len--;  		put_anon_vma(rmap_item->anon_vma);  		rmap_item->address &= PAGE_MASK;  		cond_resched();  	} +	/* +	 * We need the second aligned pointer of the migrate_nodes +	 * list_head to stay clear from the rb_parent_color union +	 * (aligned and different than any node) and also different +	 * from &migrate_nodes. This will verify that future list.h changes +	 * don't break STABLE_NODE_DUP_HEAD. +	 */ +#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ +	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); +	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); +#endif +  	if (stable_node->head == &migrate_nodes)  		list_del(&stable_node->list);  	else -		rb_erase(&stable_node->node, -			 root_stable_tree + NUMA(stable_node->nid)); +		stable_node_dup_del(stable_node);  	free_stable_node(stable_node);  } @@ -635,6 +762,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)  			ksm_pages_sharing--;  		else  			ksm_pages_shared--; +		VM_BUG_ON(stable_node->rmap_hlist_len <= 0); +		stable_node->rmap_hlist_len--;  		put_anon_vma(rmap_item->anon_vma);  		rmap_item->address &= PAGE_MASK; @@ -743,6 +872,31 @@ static int remove_stable_node(struct stable_node *stable_node)  	return err;  } +static int remove_stable_node_chain(struct stable_node *stable_node, +				    struct rb_root *root) +{ +	struct stable_node *dup; +	struct hlist_node *hlist_safe; + +	if (!is_stable_node_chain(stable_node)) { +		VM_BUG_ON(is_stable_node_dup(stable_node)); +		if (remove_stable_node(stable_node)) +			return true; +		else +			return false; +	} + +	hlist_for_each_entry_safe(dup, hlist_safe, +				  &stable_node->hlist, hlist_dup) { +		VM_BUG_ON(!is_stable_node_dup(dup)); +		if (remove_stable_node(dup)) +			return true; +	} +	BUG_ON(!hlist_empty(&stable_node->hlist)); +	free_stable_node_chain(stable_node, root); +	return false; +} +  static int remove_all_stable_nodes(void)  {  	struct stable_node *stable_node, *next; @@ -753,7 +907,8 @@ static int remove_all_stable_nodes(void)  		while (root_stable_tree[nid].rb_node) {  			stable_node = rb_entry(root_stable_tree[nid].rb_node,  						struct stable_node, node); -			if (remove_stable_node(stable_node)) { +			if (remove_stable_node_chain(stable_node, +						     root_stable_tree + nid)) {  				err = -EBUSY;  				break;	/* proceed to next nid */  			} @@ -1138,6 +1293,214 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,  	return err ? 
NULL : page;  } +static __always_inline +bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +{ +	VM_BUG_ON(stable_node->rmap_hlist_len < 0); +	/* +	 * Check that at least one mapping still exists, otherwise +	 * there's no much point to merge and share with this +	 * stable_node, as the underlying tree_page of the other +	 * sharer is going to be freed soon. +	 */ +	return stable_node->rmap_hlist_len && +		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct stable_node *stable_node) +{ +	return __is_page_sharing_candidate(stable_node, 0); +} + +struct page *stable_node_dup(struct stable_node **_stable_node_dup, +			     struct stable_node **_stable_node, +			     struct rb_root *root, +			     bool prune_stale_stable_nodes) +{ +	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; +	struct hlist_node *hlist_safe; +	struct page *_tree_page, *tree_page = NULL; +	int nr = 0; +	int found_rmap_hlist_len; + +	if (!prune_stale_stable_nodes || +	    time_before(jiffies, stable_node->chain_prune_time + +			msecs_to_jiffies( +				ksm_stable_node_chains_prune_millisecs))) +		prune_stale_stable_nodes = false; +	else +		stable_node->chain_prune_time = jiffies; + +	hlist_for_each_entry_safe(dup, hlist_safe, +				  &stable_node->hlist, hlist_dup) { +		cond_resched(); +		/* +		 * We must walk all stable_node_dup to prune the stale +		 * stable nodes during lookup. +		 * +		 * get_ksm_page can drop the nodes from the +		 * stable_node->hlist if they point to freed pages +		 * (that's why we do a _safe walk). The "dup" +		 * stable_node parameter itself will be freed from +		 * under us if it returns NULL. +		 */ +		_tree_page = get_ksm_page(dup, false); +		if (!_tree_page) +			continue; +		nr += 1; +		if (is_page_sharing_candidate(dup)) { +			if (!found || +			    dup->rmap_hlist_len > found_rmap_hlist_len) { +				if (found) +					put_page(tree_page); +				found = dup; +				found_rmap_hlist_len = found->rmap_hlist_len; +				tree_page = _tree_page; + +				/* skip put_page for found dup */ +				if (!prune_stale_stable_nodes) +					break; +				continue; +			} +		} +		put_page(_tree_page); +	} + +	if (found) { +		/* +		 * nr is counting all dups in the chain only if +		 * prune_stale_stable_nodes is true, otherwise we may +		 * break the loop at nr == 1 even if there are +		 * multiple entries. +		 */ +		if (prune_stale_stable_nodes && nr == 1) { +			/* +			 * If there's not just one entry it would +			 * corrupt memory, better BUG_ON. In KSM +			 * context with no lock held it's not even +			 * fatal. +			 */ +			BUG_ON(stable_node->hlist.first->next); + +			/* +			 * There's just one entry and it is below the +			 * deduplication limit so drop the chain. +			 */ +			rb_replace_node(&stable_node->node, &found->node, +					root); +			free_stable_node(stable_node); +			ksm_stable_node_chains--; +			ksm_stable_node_dups--; +			/* +			 * NOTE: the caller depends on the stable_node +			 * to be equal to stable_node_dup if the chain +			 * was collapsed. +			 */ +			*_stable_node = found; +			/* +			 * Just for robustneess as stable_node is +			 * otherwise left as a stable pointer, the +			 * compiler shall optimize it away at build +			 * time. 
+			 */ +			stable_node = NULL; +		} else if (stable_node->hlist.first != &found->hlist_dup && +			   __is_page_sharing_candidate(found, 1)) { +			/* +			 * If the found stable_node dup can accept one +			 * more future merge (in addition to the one +			 * that is underway) and is not at the head of +			 * the chain, put it there so next search will +			 * be quicker in the !prune_stale_stable_nodes +			 * case. +			 * +			 * NOTE: it would be inaccurate to use nr > 1 +			 * instead of checking the hlist.first pointer +			 * directly, because in the +			 * prune_stale_stable_nodes case "nr" isn't +			 * the position of the found dup in the chain, +			 * but the total number of dups in the chain. +			 */ +			hlist_del(&found->hlist_dup); +			hlist_add_head(&found->hlist_dup, +				       &stable_node->hlist); +		} +	} + +	*_stable_node_dup = found; +	return tree_page; +} + +static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, +					       struct rb_root *root) +{ +	if (!is_stable_node_chain(stable_node)) +		return stable_node; +	if (hlist_empty(&stable_node->hlist)) { +		free_stable_node_chain(stable_node, root); +		return NULL; +	} +	return hlist_entry(stable_node->hlist.first, +			   typeof(*stable_node), hlist_dup); +} + +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. + */ +static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, +					struct stable_node **_stable_node, +					struct rb_root *root, +					bool prune_stale_stable_nodes) +{ +	struct stable_node *stable_node = *_stable_node; +	if (!is_stable_node_chain(stable_node)) { +		if (is_page_sharing_candidate(stable_node)) { +			*_stable_node_dup = stable_node; +			return get_ksm_page(stable_node, false); +		} +		/* +		 * _stable_node_dup set to NULL means the stable_node +		 * reached the ksm_max_page_sharing limit. 
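stable_node_dup() walks a chain's dups and, among those still below the sharing limit, prefers the fullest one, so new sharers pack existing dups before a fresh one has to be created. A simplified version of just that packing policy, with an array of fill levels instead of the hlist walk (no pruning, no page references):

/*
 * Simplified version of the dup selection done by stable_node_dup(): walk
 * the dups of a chain and return the one with the highest rmap_hlist_len
 * that can still accept another sharer.  No page lookups or stale-node
 * pruning, just the packing policy.
 */
#include <stdio.h>

static int ksm_max_page_sharing = 256;

/* Returns the index of the chosen dup, or -1 if every dup is empty or full. */
static int pick_dup(const int *rmap_hlist_len, int ndups)
{
	int i, found = -1, found_len = -1;

	for (i = 0; i < ndups; i++) {
		int len = rmap_hlist_len[i];

		if (!len || len >= ksm_max_page_sharing)
			continue;		/* unused or already at the limit */
		if (len > found_len) {
			found = i;
			found_len = len;
		}
	}
	return found;
}

int main(void)
{
	int dups[] = { 256, 3, 17, 0 };		/* full, sparse, fuller, empty */

	printf("%d\n", pick_dup(dups, 4));	/* 2: the fullest dup with room left */
	return 0;
}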
+		 */ +		*_stable_node_dup = NULL; +		return NULL; +	} +	return stable_node_dup(_stable_node_dup, _stable_node, root, +			       prune_stale_stable_nodes); +} + +static __always_inline struct page *chain_prune(struct stable_node **s_n_d, +						struct stable_node **s_n, +						struct rb_root *root) +{ +	return __stable_node_chain(s_n_d, s_n, root, true); +} + +static __always_inline struct page *chain(struct stable_node **s_n_d, +					  struct stable_node *s_n, +					  struct rb_root *root) +{ +	struct stable_node *old_stable_node = s_n; +	struct page *tree_page; + +	tree_page = __stable_node_chain(s_n_d, &s_n, root, false); +	/* not pruning dups so s_n cannot have changed */ +	VM_BUG_ON(s_n != old_stable_node); +	return tree_page; +} +  /*   * stable_tree_search - search for page inside the stable tree   * @@ -1153,7 +1516,7 @@ static struct page *stable_tree_search(struct page *page)  	struct rb_root *root;  	struct rb_node **new;  	struct rb_node *parent; -	struct stable_node *stable_node; +	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;  	struct stable_node *page_node;  	page_node = page_stable_node(page); @@ -1175,7 +1538,44 @@ again:  		cond_resched();  		stable_node = rb_entry(*new, struct stable_node, node); -		tree_page = get_ksm_page(stable_node, false); +		stable_node_any = NULL; +		tree_page = chain_prune(&stable_node_dup, &stable_node,	root); +		/* +		 * NOTE: stable_node may have been freed by +		 * chain_prune() if the returned stable_node_dup is +		 * not NULL. stable_node_dup may have been inserted in +		 * the rbtree instead as a regular stable_node (in +		 * order to collapse the stable_node chain if a single +		 * stable_node dup was found in it). In such case the +		 * stable_node is overwritten by the calleee to point +		 * to the stable_node_dup that was collapsed in the +		 * stable rbtree and stable_node will be equal to +		 * stable_node_dup like if the chain never existed. +		 */ +		if (!stable_node_dup) { +			/* +			 * Either all stable_node dups were full in +			 * this stable_node chain, or this chain was +			 * empty and should be rb_erased. +			 */ +			stable_node_any = stable_node_dup_any(stable_node, +							      root); +			if (!stable_node_any) { +				/* rb_erase just run */ +				goto again; +			} +			/* +			 * Take any of the stable_node dups page of +			 * this stable_node chain to let the tree walk +			 * continue. All KSM pages belonging to the +			 * stable_node dups in a stable_node chain +			 * have the same content and they're +			 * wrprotected at all times. Any will work +			 * fine to continue the walk. +			 */ +			tree_page = get_ksm_page(stable_node_any, false); +		} +		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);  		if (!tree_page) {  			/*  			 * If we walked over a stale stable_node, @@ -1198,6 +1598,34 @@ again:  		else if (ret > 0)  			new = &parent->rb_right;  		else { +			if (page_node) { +				VM_BUG_ON(page_node->head != &migrate_nodes); +				/* +				 * Test if the migrated page should be merged +				 * into a stable node dup. If the mapcount is +				 * 1 we can migrate it with another KSM page +				 * without adding it to the chain. 
+				 */ +				if (page_mapcount(page) > 1) +					goto chain_append; +			} + +			if (!stable_node_dup) { +				/* +				 * If the stable_node is a chain and +				 * we got a payload match in memcmp +				 * but we cannot merge the scanned +				 * page in any of the existing +				 * stable_node dups because they're +				 * all full, we need to wait the +				 * scanned page to find itself a match +				 * in the unstable tree to create a +				 * brand new KSM page to add later to +				 * the dups of this stable_node. +				 */ +				return NULL; +			} +  			/*  			 * Lock and unlock the stable_node's page (which  			 * might already have been migrated) so that page @@ -1205,23 +1633,21 @@ again:  			 * It would be more elegant to return stable_node  			 * than kpage, but that involves more changes.  			 */ -			tree_page = get_ksm_page(stable_node, true); -			if (tree_page) { -				unlock_page(tree_page); -				if (get_kpfn_nid(stable_node->kpfn) != -						NUMA(stable_node->nid)) { -					put_page(tree_page); -					goto replace; -				} -				return tree_page; -			} -			/* -			 * There is now a place for page_node, but the tree may -			 * have been rebalanced, so re-evaluate parent and new. -			 */ -			if (page_node) +			tree_page = get_ksm_page(stable_node_dup, true); +			if (unlikely(!tree_page)) +				/* +				 * The tree may have been rebalanced, +				 * so re-evaluate parent and new. +				 */  				goto again; -			return NULL; +			unlock_page(tree_page); + +			if (get_kpfn_nid(stable_node_dup->kpfn) != +			    NUMA(stable_node_dup->nid)) { +				put_page(tree_page); +				goto replace; +			} +			return tree_page;  		}  	} @@ -1232,22 +1658,95 @@ again:  	DO_NUMA(page_node->nid = nid);  	rb_link_node(&page_node->node, parent, new);  	rb_insert_color(&page_node->node, root); -	get_page(page); -	return page; +out: +	if (is_page_sharing_candidate(page_node)) { +		get_page(page); +		return page; +	} else +		return NULL;  replace: -	if (page_node) { -		list_del(&page_node->list); -		DO_NUMA(page_node->nid = nid); -		rb_replace_node(&stable_node->node, &page_node->node, root); -		get_page(page); +	/* +	 * If stable_node was a chain and chain_prune collapsed it, +	 * stable_node has been updated to be the new regular +	 * stable_node. A collapse of the chain is indistinguishable +	 * from the case there was no chain in the stable +	 * rbtree. Otherwise stable_node is the chain and +	 * stable_node_dup is the dup to replace. 
+	 */ +	if (stable_node_dup == stable_node) { +		VM_BUG_ON(is_stable_node_chain(stable_node_dup)); +		VM_BUG_ON(is_stable_node_dup(stable_node_dup)); +		/* there is no chain */ +		if (page_node) { +			VM_BUG_ON(page_node->head != &migrate_nodes); +			list_del(&page_node->list); +			DO_NUMA(page_node->nid = nid); +			rb_replace_node(&stable_node_dup->node, +					&page_node->node, +					root); +			if (is_page_sharing_candidate(page_node)) +				get_page(page); +			else +				page = NULL; +		} else { +			rb_erase(&stable_node_dup->node, root); +			page = NULL; +		}  	} else { -		rb_erase(&stable_node->node, root); -		page = NULL; +		VM_BUG_ON(!is_stable_node_chain(stable_node)); +		__stable_node_dup_del(stable_node_dup); +		if (page_node) { +			VM_BUG_ON(page_node->head != &migrate_nodes); +			list_del(&page_node->list); +			DO_NUMA(page_node->nid = nid); +			stable_node_chain_add_dup(page_node, stable_node); +			if (is_page_sharing_candidate(page_node)) +				get_page(page); +			else +				page = NULL; +		} else { +			page = NULL; +		}  	} -	stable_node->head = &migrate_nodes; -	list_add(&stable_node->list, stable_node->head); +	stable_node_dup->head = &migrate_nodes; +	list_add(&stable_node_dup->list, stable_node_dup->head);  	return page; + +chain_append: +	/* stable_node_dup could be null if it reached the limit */ +	if (!stable_node_dup) +		stable_node_dup = stable_node_any; +	/* +	 * If stable_node was a chain and chain_prune collapsed it, +	 * stable_node has been updated to be the new regular +	 * stable_node. A collapse of the chain is indistinguishable +	 * from the case there was no chain in the stable +	 * rbtree. Otherwise stable_node is the chain and +	 * stable_node_dup is the dup to replace. +	 */ +	if (stable_node_dup == stable_node) { +		VM_BUG_ON(is_stable_node_chain(stable_node_dup)); +		VM_BUG_ON(is_stable_node_dup(stable_node_dup)); +		/* chain is missing so create it */ +		stable_node = alloc_stable_node_chain(stable_node_dup, +						      root); +		if (!stable_node) +			return NULL; +	} +	/* +	 * Add this stable_node dup that was +	 * migrated to the stable_node chain +	 * of the current nid for this page +	 * content. +	 */ +	VM_BUG_ON(!is_stable_node_chain(stable_node)); +	VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); +	VM_BUG_ON(page_node->head != &migrate_nodes); +	list_del(&page_node->list); +	DO_NUMA(page_node->nid = nid); +	stable_node_chain_add_dup(page_node, stable_node); +	goto out;  }  /* @@ -1264,7 +1763,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage)  	struct rb_root *root;  	struct rb_node **new;  	struct rb_node *parent; -	struct stable_node *stable_node; +	struct stable_node *stable_node, *stable_node_dup, *stable_node_any; +	bool need_chain = false;  	kpfn = page_to_pfn(kpage);  	nid = get_kpfn_nid(kpfn); @@ -1279,7 +1779,32 @@ again:  		cond_resched();  		stable_node = rb_entry(*new, struct stable_node, node); -		tree_page = get_ksm_page(stable_node, false); +		stable_node_any = NULL; +		tree_page = chain(&stable_node_dup, stable_node, root); +		if (!stable_node_dup) { +			/* +			 * Either all stable_node dups were full in +			 * this stable_node chain, or this chain was +			 * empty and should be rb_erased. +			 */ +			stable_node_any = stable_node_dup_any(stable_node, +							      root); +			if (!stable_node_any) { +				/* rb_erase just run */ +				goto again; +			} +			/* +			 * Take any of the stable_node dups page of +			 * this stable_node chain to let the tree walk +			 * continue. 
All KSM pages belonging to the +			 * stable_node dups in a stable_node chain +			 * have the same content and they're +			 * wrprotected at all times. Any will work +			 * fine to continue the walk. +			 */ +			tree_page = get_ksm_page(stable_node_any, false); +		} +		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);  		if (!tree_page) {  			/*  			 * If we walked over a stale stable_node, @@ -1302,27 +1827,37 @@ again:  		else if (ret > 0)  			new = &parent->rb_right;  		else { -			/* -			 * It is not a bug that stable_tree_search() didn't -			 * find this node: because at that time our page was -			 * not yet write-protected, so may have changed since. -			 */ -			return NULL; +			need_chain = true; +			break;  		}  	} -	stable_node = alloc_stable_node(); -	if (!stable_node) +	stable_node_dup = alloc_stable_node(); +	if (!stable_node_dup)  		return NULL; -	INIT_HLIST_HEAD(&stable_node->hlist); -	stable_node->kpfn = kpfn; -	set_page_stable_node(kpage, stable_node); -	DO_NUMA(stable_node->nid = nid); -	rb_link_node(&stable_node->node, parent, new); -	rb_insert_color(&stable_node->node, root); +	INIT_HLIST_HEAD(&stable_node_dup->hlist); +	stable_node_dup->kpfn = kpfn; +	set_page_stable_node(kpage, stable_node_dup); +	stable_node_dup->rmap_hlist_len = 0; +	DO_NUMA(stable_node_dup->nid = nid); +	if (!need_chain) { +		rb_link_node(&stable_node_dup->node, parent, new); +		rb_insert_color(&stable_node_dup->node, root); +	} else { +		if (!is_stable_node_chain(stable_node)) { +			struct stable_node *orig = stable_node; +			/* chain is missing so create it */ +			stable_node = alloc_stable_node_chain(orig, root); +			if (!stable_node) { +				free_stable_node(stable_node_dup); +				return NULL; +			} +		} +		stable_node_chain_add_dup(stable_node_dup, stable_node); +	} -	return stable_node; +	return stable_node_dup;  }  /* @@ -1412,8 +1947,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,   * the same ksm page.   */  static void stable_tree_append(struct rmap_item *rmap_item, -			       struct stable_node *stable_node) +			       struct stable_node *stable_node, +			       bool max_page_sharing_bypass)  { +	/* +	 * rmap won't find this mapping if we don't insert the +	 * rmap_item in the right stable_node +	 * duplicate. page_migration could break later if rmap breaks, +	 * so we can as well crash here. We really need to check for +	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check +	 * for other negative values as an undeflow if detected here +	 * for the first time (and not when decreasing rmap_hlist_len) +	 * would be sign of memory corruption in the stable_node. 
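
Taking a step back, the structure the code above maintains is two-level: a chain node sits in the stable rbtree for a given page content, it carries a list of duplicate KSM pages (dups), and each dup accepts at most max_page_sharing rmap_items before a new dup is created. A throwaway userspace model of just that placement policy — hypothetical names, default cap of 256 assumed, in no way kernel code:

/*
 * Toy model of a KSM stable_node chain: one chain per content,
 * many dups per chain, at most max_page_sharing mappings per dup.
 */
#include <stdio.h>
#include <stdlib.h>

struct dup {
        int nr_mappings;        /* rmap_hlist_len in the real code */
        struct dup *next;       /* hlist_dup in the real code */
};

struct chain {
        struct dup *dups;       /* stable_node->hlist */
};

static struct dup *pick_or_add_dup(struct chain *c, int max_page_sharing)
{
        struct dup *d;

        /* Prefer an existing dup that still has room to share. */
        for (d = c->dups; d; d = d->next)
                if (d->nr_mappings < max_page_sharing)
                        return d;

        /* All dups are full: start a brand new dup on the chain. */
        d = calloc(1, sizeof(*d));
        if (!d)
                return NULL;
        d->next = c->dups;
        c->dups = d;
        return d;
}

int main(void)
{
        struct chain c = { NULL };
        struct dup *d;
        int i, ndups = 0;

        for (i = 0; i < 1000; i++) {
                d = pick_or_add_dup(&c, 256);   /* assumed default cap */
                if (d)
                        d->nr_mappings++;
        }
        for (d = c.dups; d; d = d->next)
                ndups++;
        printf("1000 mappings spread over %d dups\n", ndups);
        return 0;
}

That cap is what keeps any single rmap walk bounded: no KSM page ever accumulates more than max_page_sharing reverse mappings.
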
+	 */ +	BUG_ON(stable_node->rmap_hlist_len < 0); + +	stable_node->rmap_hlist_len++; +	if (!max_page_sharing_bypass) +		/* possibly non fatal but unexpected overflow, only warn */ +		WARN_ON_ONCE(stable_node->rmap_hlist_len > +			     ksm_max_page_sharing); +  	rmap_item->head = stable_node;  	rmap_item->address |= STABLE_FLAG;  	hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1441,19 +1995,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)  	struct page *kpage;  	unsigned int checksum;  	int err; +	bool max_page_sharing_bypass = false;  	stable_node = page_stable_node(page);  	if (stable_node) {  		if (stable_node->head != &migrate_nodes && -		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { -			rb_erase(&stable_node->node, -				 root_stable_tree + NUMA(stable_node->nid)); +		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != +		    NUMA(stable_node->nid)) { +			stable_node_dup_del(stable_node);  			stable_node->head = &migrate_nodes;  			list_add(&stable_node->list, stable_node->head);  		}  		if (stable_node->head != &migrate_nodes &&  		    rmap_item->head == stable_node)  			return; +		/* +		 * If it's a KSM fork, allow it to go over the sharing limit +		 * without warnings. +		 */ +		if (!is_page_sharing_candidate(stable_node)) +			max_page_sharing_bypass = true;  	}  	/* We first start with searching the page inside the stable tree */ @@ -1473,7 +2034,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)  			 * add its rmap_item to the stable tree.  			 */  			lock_page(kpage); -			stable_tree_append(rmap_item, page_stable_node(kpage)); +			stable_tree_append(rmap_item, page_stable_node(kpage), +					   max_page_sharing_bypass);  			unlock_page(kpage);  		}  		put_page(kpage); @@ -1523,8 +2085,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)  			lock_page(kpage);  			stable_node = stable_tree_insert(kpage);  			if (stable_node) { -				stable_tree_append(tree_rmap_item, stable_node); -				stable_tree_append(rmap_item, stable_node); +				stable_tree_append(tree_rmap_item, stable_node, +						   false); +				stable_tree_append(rmap_item, stable_node, +						   false);  			}  			unlock_page(kpage); @@ -2028,6 +2592,48 @@ static void wait_while_offlining(void)  	}  } +static bool stable_node_dup_remove_range(struct stable_node *stable_node, +					 unsigned long start_pfn, +					 unsigned long end_pfn) +{ +	if (stable_node->kpfn >= start_pfn && +	    stable_node->kpfn < end_pfn) { +		/* +		 * Don't get_ksm_page, page has already gone: +		 * which is why we keep kpfn instead of page* +		 */ +		remove_node_from_stable_tree(stable_node); +		return true; +	} +	return false; +} + +static bool stable_node_chain_remove_range(struct stable_node *stable_node, +					   unsigned long start_pfn, +					   unsigned long end_pfn, +					   struct rb_root *root) +{ +	struct stable_node *dup; +	struct hlist_node *hlist_safe; + +	if (!is_stable_node_chain(stable_node)) { +		VM_BUG_ON(is_stable_node_dup(stable_node)); +		return stable_node_dup_remove_range(stable_node, start_pfn, +						    end_pfn); +	} + +	hlist_for_each_entry_safe(dup, hlist_safe, +				  &stable_node->hlist, hlist_dup) { +		VM_BUG_ON(!is_stable_node_dup(dup)); +		stable_node_dup_remove_range(dup, start_pfn, end_pfn); +	} +	if (hlist_empty(&stable_node->hlist)) { +		free_stable_node_chain(stable_node, root); +		return true; /* notify caller that tree was rebalanced */ +	} else +		return false; +} +  static void 
ksm_check_stable_tree(unsigned long start_pfn,  				  unsigned long end_pfn)  { @@ -2039,15 +2645,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn,  		node = rb_first(root_stable_tree + nid);  		while (node) {  			stable_node = rb_entry(node, struct stable_node, node); -			if (stable_node->kpfn >= start_pfn && -			    stable_node->kpfn < end_pfn) { -				/* -				 * Don't get_ksm_page, page has already gone: -				 * which is why we keep kpfn instead of page* -				 */ -				remove_node_from_stable_tree(stable_node); +			if (stable_node_chain_remove_range(stable_node, +							   start_pfn, end_pfn, +							   root_stable_tree + +							   nid))  				node = rb_first(root_stable_tree + nid); -			} else +			else  				node = rb_next(node);  			cond_resched();  		} @@ -2293,6 +2896,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj,  }  KSM_ATTR(use_zero_pages); +static ssize_t max_page_sharing_show(struct kobject *kobj, +				     struct kobj_attribute *attr, char *buf) +{ +	return sprintf(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, +				      struct kobj_attribute *attr, +				      const char *buf, size_t count) +{ +	int err; +	int knob; + +	err = kstrtoint(buf, 10, &knob); +	if (err) +		return err; +	/* +	 * When a KSM page is created it is shared by 2 mappings. This +	 * being a signed comparison, it implicitly verifies it's not +	 * negative. +	 */ +	if (knob < 2) +		return -EINVAL; + +	if (READ_ONCE(ksm_max_page_sharing) == knob) +		return count; + +	mutex_lock(&ksm_thread_mutex); +	wait_while_offlining(); +	if (ksm_max_page_sharing != knob) { +		if (ksm_pages_shared || remove_all_stable_nodes()) +			err = -EBUSY; +		else +			ksm_max_page_sharing = knob; +	} +	mutex_unlock(&ksm_thread_mutex); + +	return err ? 
err : count; +} +KSM_ATTR(max_page_sharing); +  static ssize_t pages_shared_show(struct kobject *kobj,  				 struct kobj_attribute *attr, char *buf)  { @@ -2331,6 +2975,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj,  }  KSM_ATTR_RO(pages_volatile); +static ssize_t stable_node_dups_show(struct kobject *kobj, +				     struct kobj_attribute *attr, char *buf) +{ +	return sprintf(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, +				       struct kobj_attribute *attr, char *buf) +{ +	return sprintf(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, +					struct kobj_attribute *attr, +					char *buf) +{ +	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, +					 struct kobj_attribute *attr, +					 const char *buf, size_t count) +{ +	unsigned long msecs; +	int err; + +	err = kstrtoul(buf, 10, &msecs); +	if (err || msecs > UINT_MAX) +		return -EINVAL; + +	ksm_stable_node_chains_prune_millisecs = msecs; + +	return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); +  static ssize_t full_scans_show(struct kobject *kobj,  			       struct kobj_attribute *attr, char *buf)  { @@ -2350,6 +3034,10 @@ static struct attribute *ksm_attrs[] = {  #ifdef CONFIG_NUMA  	&merge_across_nodes_attr.attr,  #endif +	&max_page_sharing_attr.attr, +	&stable_node_chains_attr.attr, +	&stable_node_dups_attr.attr, +	&stable_node_chains_prune_millisecs_attr.attr,  	&use_zero_pages_attr.attr,  	NULL,  }; diff --git a/mm/memblock.c b/mm/memblock.c index 7b8a5db76a2f..2cb25fe4452c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,9 +54,6 @@ struct memblock memblock __initdata_memblock = {  };  int memblock_debug __initdata_memblock; -#ifdef CONFIG_MOVABLE_NODE -bool movable_node_enabled __initdata_memblock = false; -#endif  static bool system_has_some_mirror __initdata_memblock = false;  static int memblock_can_resize __initdata_memblock;  static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..425aa0caa712 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event {  	 */  	poll_table pt;  	wait_queue_head_t *wqh; -	wait_queue_t wait; +	wait_queue_entry_t wait;  	struct work_struct remove;  }; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);  struct oom_wait_info {  	struct mem_cgroup *memcg; -	wait_queue_t	wait; +	wait_queue_entry_t	wait;  }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait,  	unsigned mode, int sync, void *arg)  {  	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)  	owait.wait.flags = 0;  	owait.wait.func = memcg_oom_wake_function;  	owait.wait.private = current; -	INIT_LIST_HEAD(&owait.wait.task_list); +	INIT_LIST_HEAD(&owait.wait.entry);  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);  	mem_cgroup_mark_under_oom(memcg); @@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head)  #ifdef CONFIG_MEMCG_SWAP  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, -					 bool charge) +				       int nr_entries)  { -	int val = (charge) ? 
1 : -1; -	this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); +	this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);  }  /** @@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,  	new_id = mem_cgroup_id(to);  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { -		mem_cgroup_swap_statistics(from, false); -		mem_cgroup_swap_statistics(to, true); +		mem_cgroup_swap_statistics(from, -1); +		mem_cgroup_swap_statistics(to, 1);  		return 0;  	}  	return -EINVAL; @@ -3574,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)  	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);  	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); +	seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));  	return 0;  } @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work)   *   * Called with wqh->lock held and interrupts disabled.   */ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,  			    int sync, void *key)  {  	struct mem_cgroup_event *event = @@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)  	if (!pn)  		return 1; +	pn->lruvec_stat = alloc_percpu(struct lruvec_stat); +	if (!pn->lruvec_stat) { +		kfree(pn); +		return 1; +	} +  	lruvec_init(&pn->lruvec);  	pn->usage_in_excess = 0;  	pn->on_tree = false; @@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)  static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)  { -	kfree(memcg->nodeinfo[node]); +	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + +	free_percpu(pn->lruvec_stat); +	kfree(pn);  }  static void __mem_cgroup_free(struct mem_cgroup *memcg) @@ -5165,6 +5174,7 @@ static int memory_events_show(struct seq_file *m, void *v)  	seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));  	seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));  	seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); +	seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));  	return 0;  } @@ -5197,8 +5207,8 @@ static int memory_stat_show(struct seq_file *m, void *v)  	seq_printf(m, "kernel_stack %llu\n",  		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);  	seq_printf(m, "slab %llu\n", -		   (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + -			 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); +		   (u64)(stat[NR_SLAB_RECLAIMABLE] + +			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);  	seq_printf(m, "sock %llu\n",  		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5222,15 +5232,25 @@ static int memory_stat_show(struct seq_file *m, void *v)  	}  	seq_printf(m, "slab_reclaimable %llu\n", -		   (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); +		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);  	seq_printf(m, "slab_unreclaimable %llu\n", -		   (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); +		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);  	/* Accumulated memory events */  	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);  	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); +	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); +	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + +		   events[PGSCAN_DIRECT]); +	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + +		   events[PGSTEAL_DIRECT]); +	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); +	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); +	
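
The reclaim counters added above (pgrefill, pgscan, pgsteal, pgactivate, pgdeactivate) and just below (pglazyfree, pglazyfreed) surface in the cgroup2 memory.stat file, and the new oom_kill count lands in memory.events. A minimal reader, assuming a cgroup2 hierarchy mounted at /sys/fs/cgroup (pass a cgroup directory as the first argument otherwise):

/*
 * Userspace sketch: read the new per-cgroup reclaim counters from
 * memory.stat and report a rough reclaim efficiency.
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        char path[512], key[64];
        unsigned long long val, pgscan = 0, pgsteal = 0;
        FILE *f;

        snprintf(path, sizeof(path), "%s/memory.stat",
                 argc > 1 ? argv[1] : "/sys/fs/cgroup");
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        while (fscanf(f, "%63s %llu", key, &val) == 2) {
                if (!strcmp(key, "pgscan"))
                        pgscan = val;
                else if (!strcmp(key, "pgsteal"))
                        pgsteal = val;
        }
        fclose(f);
        printf("pgscan=%llu pgsteal=%llu reclaim efficiency=%.1f%%\n",
               pgscan, pgsteal,
               pgscan ? 100.0 * pgsteal / pgscan : 0.0);
        return 0;
}

Dividing pgsteal by pgscan gives a per-cgroup reclaim efficiency that previously had to be inferred from the global vmstat counters.
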
seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); +	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); +  	seq_printf(m, "workingset_refault %lu\n",  		   stat[WORKINGSET_REFAULT]);  	seq_printf(m, "workingset_activate %lu\n", @@ -5445,7 +5465,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,  		 * let's not wait for it.  The page already received a  		 * memory+swap charge, drop the swap entry duplicate.  		 */ -		mem_cgroup_uncharge_swap(entry); +		mem_cgroup_uncharge_swap(entry, nr_pages);  	}  } @@ -5873,9 +5893,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)  	 * ancestor for the swap instead and transfer the memory+swap charge.  	 */  	swap_memcg = mem_cgroup_id_get_online(memcg); -	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); +	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);  	VM_BUG_ON_PAGE(oldid, page); -	mem_cgroup_swap_statistics(swap_memcg, true); +	mem_cgroup_swap_statistics(swap_memcg, 1);  	page->mem_cgroup = NULL; @@ -5902,19 +5922,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)  		css_put(&memcg->css);  } -/* - * mem_cgroup_try_charge_swap - try charging a swap entry +/** + * mem_cgroup_try_charge_swap - try charging swap space for a page   * @page: page being added to swap   * @entry: swap entry to charge   * - * Try to charge @entry to the memcg that @page belongs to. + * Try to charge @page's memcg for the swap space at @entry.   *   * Returns 0 on success, -ENOMEM on failure.   */  int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)  { -	struct mem_cgroup *memcg; +	unsigned int nr_pages = hpage_nr_pages(page);  	struct page_counter *counter; +	struct mem_cgroup *memcg;  	unsigned short oldid;  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) @@ -5929,25 +5950,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)  	memcg = mem_cgroup_id_get_online(memcg);  	if (!mem_cgroup_is_root(memcg) && -	    !page_counter_try_charge(&memcg->swap, 1, &counter)) { +	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {  		mem_cgroup_id_put(memcg);  		return -ENOMEM;  	} -	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); +	/* Get references for the tail pages, too */ +	if (nr_pages > 1) +		mem_cgroup_id_get_many(memcg, nr_pages - 1); +	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);  	VM_BUG_ON_PAGE(oldid, page); -	mem_cgroup_swap_statistics(memcg, true); +	mem_cgroup_swap_statistics(memcg, nr_pages);  	return 0;  }  /** - * mem_cgroup_uncharge_swap - uncharge a swap entry + * mem_cgroup_uncharge_swap - uncharge swap space   * @entry: swap entry to uncharge - * - * Drop the swap charge associated with @entry. 
+ * @nr_pages: the amount of swap space to uncharge   */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)  {  	struct mem_cgroup *memcg;  	unsigned short id; @@ -5955,18 +5978,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)  	if (!do_swap_account)  		return; -	id = swap_cgroup_record(entry, 0); +	id = swap_cgroup_record(entry, 0, nr_pages);  	rcu_read_lock();  	memcg = mem_cgroup_from_id(id);  	if (memcg) {  		if (!mem_cgroup_is_root(memcg)) {  			if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) -				page_counter_uncharge(&memcg->swap, 1); +				page_counter_uncharge(&memcg->swap, nr_pages);  			else -				page_counter_uncharge(&memcg->memsw, 1); +				page_counter_uncharge(&memcg->memsw, nr_pages);  		} -		mem_cgroup_swap_statistics(memcg, false); -		mem_cgroup_id_put(memcg); +		mem_cgroup_swap_statistics(memcg, -nr_pages); +		mem_cgroup_id_put_many(memcg, nr_pages);  	}  	rcu_read_unlock();  } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 342fac9ba89b..a74c8311db95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)  	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status  	 * correctly, we save a copy of the page flags at this time.  	 */ -	page_flags = p->flags; +	if (PageHuge(p)) +		page_flags = hpage->flags; +	else +		page_flags = p->flags;  	/*  	 * unpoison always clear PG_hwpoison inside page lock @@ -1489,11 +1492,16 @@ EXPORT_SYMBOL(unpoison_memory);  static struct page *new_page(struct page *p, unsigned long private, int **x)  {  	int nid = page_to_nid(p); -	if (PageHuge(p)) -		return alloc_huge_page_node(page_hstate(compound_head(p)), -						   nid); -	else +	if (PageHuge(p)) { +		struct hstate *hstate = page_hstate(compound_head(p)); + +		if (hstate_is_gigantic(hstate)) +			return alloc_huge_page_node(hstate, NUMA_NO_NODE); + +		return alloc_huge_page_node(hstate, nid); +	} else {  		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); +	}  }  /* diff --git a/mm/memory.c b/mm/memory.c index 2e65df1831d9..e31dd97e6114 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf)  		/* Had to read the page from swap area: Major fault */  		ret = VM_FAULT_MAJOR;  		count_vm_event(PGMAJFAULT); -		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); +		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);  	} else if (PageHWPoison(page)) {  		/*  		 * hwpoisoned dirty swapcache pages are kept for killing @@ -2855,40 +2855,6 @@ out_release:  }  /* - * This is like a special single-page "expand_{down|up}wards()", - * except we must first make sure that 'address{-|+}PAGE_SIZE' - * doesn't hit another vma. - */ -static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) -{ -	address &= PAGE_MASK; -	if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { -		struct vm_area_struct *prev = vma->vm_prev; - -		/* -		 * Is there a mapping abutting this one below? -		 * -		 * That's only ok if it's the same stack mapping -		 * that has gotten split.. -		 */ -		if (prev && prev->vm_end == address) -			return prev->vm_flags & VM_GROWSDOWN ? 
0 : -ENOMEM; - -		return expand_downwards(vma, address - PAGE_SIZE); -	} -	if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { -		struct vm_area_struct *next = vma->vm_next; - -		/* As VM_GROWSDOWN but s/below/above/ */ -		if (next && next->vm_start == address + PAGE_SIZE) -			return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - -		return expand_upwards(vma, address + PAGE_SIZE); -	} -	return 0; -} - -/*   * We enter with non-exclusive mmap_sem (to exclude vma changes,   * but allow concurrent faults), and pte mapped but not yet locked.   * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf)  	if (vma->vm_flags & VM_SHARED)  		return VM_FAULT_SIGBUS; -	/* Check if we need to add a guard page to the stack */ -	if (check_stack_guard_page(vma, vmf->address) < 0) -		return VM_FAULT_SIGSEGV; -  	/*  	 * Use pte_alloc() instead of pte_alloc_map().  We can't run  	 * pte_offset_map() on pmds where a huge pmd might be created @@ -3875,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,  	__set_current_state(TASK_RUNNING);  	count_vm_event(PGFAULT); -	mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); +	count_memcg_event_mm(vma->vm_mm, PGFAULT);  	/* do counter updates before entering really critical section. */  	check_sync_rss_stat(current); @@ -4052,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,  		goto out;  	ptep = pte_offset_map_lock(mm, pmd, address, ptlp); -	if (!ptep) -		goto out;  	if (!pte_present(*ptep))  		goto unlock;  	*ptepp = ptep; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b63d7d1239df..f79aac7a12b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -79,6 +79,8 @@ static struct {  #define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)  #define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map) +bool movable_node_enabled = false; +  #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE  bool memhp_auto_online;  #else @@ -300,229 +302,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)  }  #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, -				     unsigned long end_pfn) -{ -	unsigned long old_zone_end_pfn; - -	zone_span_writelock(zone); - -	old_zone_end_pfn = zone_end_pfn(zone); -	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) -		zone->zone_start_pfn = start_pfn; - -	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - -				zone->zone_start_pfn; - -	zone_span_writeunlock(zone); -} - -static void resize_zone(struct zone *zone, unsigned long start_pfn, -		unsigned long end_pfn) -{ -	zone_span_writelock(zone); - -	if (end_pfn - start_pfn) { -		zone->zone_start_pfn = start_pfn; -		zone->spanned_pages = end_pfn - start_pfn; -	} else { -		/* -		 * make it consist as free_area_init_core(), -		 * if spanned_pages = 0, then keep start_pfn = 0 -		 */ -		zone->zone_start_pfn = 0; -		zone->spanned_pages = 0; -	} - -	zone_span_writeunlock(zone); -} - -static void fix_zone_id(struct zone *zone, unsigned long start_pfn, -		unsigned long end_pfn) -{ -	enum zone_type zid = zone_idx(zone); -	int nid = zone->zone_pgdat->node_id; -	unsigned long pfn; - -	for (pfn = start_pfn; pfn < end_pfn; pfn++) -		set_page_links(pfn_to_page(pfn), zid, nid, pfn); -} - -/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * 
alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ -static int __ref ensure_zone_is_initialized(struct zone *zone, -			unsigned long start_pfn, unsigned long num_pages) -{ -	if (!zone_is_initialized(zone)) -		return init_currently_empty_zone(zone, start_pfn, num_pages); - -	return 0; -} - -static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, -		unsigned long start_pfn, unsigned long end_pfn) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, +		bool want_memblock)  {  	int ret; -	unsigned long flags; -	unsigned long z1_start_pfn; - -	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); -	if (ret) -		return ret; - -	pgdat_resize_lock(z1->zone_pgdat, &flags); - -	/* can't move pfns which are higher than @z2 */ -	if (end_pfn > zone_end_pfn(z2)) -		goto out_fail; -	/* the move out part must be at the left most of @z2 */ -	if (start_pfn > z2->zone_start_pfn) -		goto out_fail; -	/* must included/overlap */ -	if (end_pfn <= z2->zone_start_pfn) -		goto out_fail; - -	/* use start_pfn for z1's start_pfn if z1 is empty */ -	if (!zone_is_empty(z1)) -		z1_start_pfn = z1->zone_start_pfn; -	else -		z1_start_pfn = start_pfn; - -	resize_zone(z1, z1_start_pfn, end_pfn); -	resize_zone(z2, end_pfn, zone_end_pfn(z2)); - -	pgdat_resize_unlock(z1->zone_pgdat, &flags); - -	fix_zone_id(z1, start_pfn, end_pfn); - -	return 0; -out_fail: -	pgdat_resize_unlock(z1->zone_pgdat, &flags); -	return -1; -} - -static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, -		unsigned long start_pfn, unsigned long end_pfn) -{ -	int ret; -	unsigned long flags; -	unsigned long z2_end_pfn; - -	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); -	if (ret) -		return ret; - -	pgdat_resize_lock(z1->zone_pgdat, &flags); - -	/* can't move pfns which are lower than @z1 */ -	if (z1->zone_start_pfn > start_pfn) -		goto out_fail; -	/* the move out part mast at the right most of @z1 */ -	if (zone_end_pfn(z1) >  end_pfn) -		goto out_fail; -	/* must included/overlap */ -	if (start_pfn >= zone_end_pfn(z1)) -		goto out_fail; - -	/* use end_pfn for z2's end_pfn if z2 is empty */ -	if (!zone_is_empty(z2)) -		z2_end_pfn = zone_end_pfn(z2); -	else -		z2_end_pfn = end_pfn; - -	resize_zone(z1, z1->zone_start_pfn, start_pfn); -	resize_zone(z2, start_pfn, z2_end_pfn); - -	pgdat_resize_unlock(z1->zone_pgdat, &flags); - -	fix_zone_id(z2, start_pfn, end_pfn); - -	return 0; -out_fail: -	pgdat_resize_unlock(z1->zone_pgdat, &flags); -	return -1; -} - -static struct zone * __meminit move_pfn_range(int zone_shift, -		unsigned long start_pfn, unsigned long end_pfn) -{ -	struct zone *zone = page_zone(pfn_to_page(start_pfn)); -	int ret = 0; - -	if (zone_shift < 0) -		ret = move_pfn_range_left(zone + zone_shift, zone, -					  start_pfn, end_pfn); -	else if (zone_shift) -		ret = move_pfn_range_right(zone, zone + zone_shift, -					   start_pfn, end_pfn); - -	if (ret) -		return NULL; - -	return zone + zone_shift; -} - -static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, -				      unsigned long end_pfn) -{ -	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - -	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) -		pgdat->node_start_pfn = start_pfn; - -	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - -					pgdat->node_start_pfn; -} +	int i; -static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) -{ -	struct pglist_data *pgdat = zone->zone_pgdat; -	int 
nr_pages = PAGES_PER_SECTION; -	int nid = pgdat->node_id; -	int zone_type; -	unsigned long flags, pfn; -	int ret; +	if (pfn_valid(phys_start_pfn)) +		return -EEXIST; -	zone_type = zone - pgdat->node_zones; -	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); -	if (ret) +	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); +	if (ret < 0)  		return ret; -	pgdat_resize_lock(zone->zone_pgdat, &flags); -	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); -	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, -			phys_start_pfn + nr_pages); -	pgdat_resize_unlock(zone->zone_pgdat, &flags); -	memmap_init_zone(nr_pages, nid, zone_type, -			 phys_start_pfn, MEMMAP_HOTPLUG); - -	/* online_page_range is called later and expects pages reserved */ -	for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { +	/* +	 * Make all the pages reserved so that nobody will stumble over half +	 * initialized state. +	 * FIXME: We also have to associate it with a node because pfn_to_node +	 * relies on having page with the proper node. +	 */ +	for (i = 0; i < PAGES_PER_SECTION; i++) { +		unsigned long pfn = phys_start_pfn + i; +		struct page *page;  		if (!pfn_valid(pfn))  			continue; -		SetPageReserved(pfn_to_page(pfn)); +		page = pfn_to_page(pfn); +		set_page_node(page, nid); +		SetPageReserved(page);  	} -	return 0; -} - -static int __meminit __add_section(int nid, struct zone *zone, -					unsigned long phys_start_pfn) -{ -	int ret; - -	if (pfn_valid(phys_start_pfn)) -		return -EEXIST; - -	ret = sparse_add_one_section(zone, phys_start_pfn); - -	if (ret < 0) -		return ret; - -	ret = __add_zone(zone, phys_start_pfn); -	if (ret < 0) -		return ret; +	if (!want_memblock) +		return 0;  	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));  } @@ -533,16 +344,14 @@ static int __meminit __add_section(int nid, struct zone *zone,   * call this function after deciding the zone to which to   * add the new pages.   
*/ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, -			unsigned long nr_pages) +int __ref __add_pages(int nid, unsigned long phys_start_pfn, +			unsigned long nr_pages, bool want_memblock)  {  	unsigned long i;  	int err = 0;  	int start_sec, end_sec;  	struct vmem_altmap *altmap; -	clear_zone_contiguous(zone); -  	/* during initialize mem_map, align hot-added range to section */  	start_sec = pfn_to_section_nr(phys_start_pfn);  	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -562,7 +371,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,  	}  	for (i = start_sec; i <= end_sec; i++) { -		err = __add_section(nid, zone, section_nr_to_pfn(i)); +		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);  		/*  		 * EEXIST is finally dealt with by ioresource collision @@ -575,7 +384,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,  	}  	vmemmap_populate_print_last();  out: -	set_zone_contiguous(zone);  	return err;  }  EXPORT_SYMBOL_GPL(__add_pages); @@ -939,33 +747,20 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,  	unsigned long i;  	unsigned long onlined_pages = *(unsigned long *)arg;  	struct page *page; +  	if (PageReserved(pfn_to_page(start_pfn)))  		for (i = 0; i < nr_pages; i++) {  			page = pfn_to_page(start_pfn + i);  			(*online_page_callback)(page);  			onlined_pages++;  		} + +	online_mem_sections(start_pfn, start_pfn + nr_pages); +  	*(unsigned long *)arg = onlined_pages;  	return 0;  } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have - * normal memory. - */ -static bool can_online_high_movable(struct zone *zone) -{ -	return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(struct zone *zone) -{ -	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); -} -#endif /* CONFIG_MOVABLE_NODE */ -  /* check which state of node_states will be changed when online memory */  static void node_states_check_changes_online(unsigned long nr_pages,  	struct zone *zone, struct memory_notify *arg) @@ -1040,39 +835,131 @@ static void node_states_set_node(int node, struct memory_notify *arg)  	node_set_state(node, N_MEMORY);  } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, -		   enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)  { -	struct zone *zone = page_zone(pfn_to_page(pfn)); -	enum zone_type idx = zone_idx(zone); -	int i; +	struct pglist_data *pgdat = NODE_DATA(nid); +	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; +	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); -	*zone_shift = 0; +	/* +	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL +	 * physically before ZONE_MOVABLE. All we need is they do not +	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE +	 * though so let's stick with it for simplicity for now. 
+	 * TODO make sure we do not overlap with ZONE_DEVICE +	 */ +	if (online_type == MMOP_ONLINE_KERNEL) { +		if (zone_is_empty(movable_zone)) +			return true; +		return movable_zone->zone_start_pfn >= pfn + nr_pages; +	} else if (online_type == MMOP_ONLINE_MOVABLE) { +		return zone_end_pfn(default_zone) <= pfn; +	} -	if (idx < target) { -		/* pages must be at end of current zone */ -		if (pfn + nr_pages != zone_end_pfn(zone)) -			return false; +	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ +	return online_type == MMOP_ONLINE_KEEP; +} + +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, +		unsigned long nr_pages) +{ +	unsigned long old_end_pfn = zone_end_pfn(zone); + +	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) +		zone->zone_start_pfn = start_pfn; + +	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, +                                     unsigned long nr_pages) +{ +	unsigned long old_end_pfn = pgdat_end_pfn(pgdat); + +	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) +		pgdat->node_start_pfn = start_pfn; + +	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void __ref move_pfn_range_to_zone(struct zone *zone, +		unsigned long start_pfn, unsigned long nr_pages) +{ +	struct pglist_data *pgdat = zone->zone_pgdat; +	int nid = pgdat->node_id; +	unsigned long flags; -		/* no zones in use between current zone and target */ -		for (i = idx + 1; i < target; i++) -			if (zone_is_initialized(zone - idx + i)) -				return false; +	if (zone_is_empty(zone)) +		init_currently_empty_zone(zone, start_pfn, nr_pages); + +	clear_zone_contiguous(zone); + +	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ +	pgdat_resize_lock(pgdat, &flags); +	zone_span_writelock(zone); +	resize_zone_range(zone, start_pfn, nr_pages); +	zone_span_writeunlock(zone); +	resize_pgdat_range(pgdat, start_pfn, nr_pages); +	pgdat_resize_unlock(pgdat, &flags); + +	/* +	 * TODO now we have a visible range of pages which are not associated +	 * with their zone properly. Not nice but set_pfnblock_flags_mask +	 * expects the zone spans the pfn range. All the pages in the range +	 * are reserved so nobody should be touching them so we should be safe +	 */ +	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + +	set_zone_contiguous(zone); +} + +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. + */ +struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, +		unsigned long nr_pages) +{ +	struct pglist_data *pgdat = NODE_DATA(nid); +	int zid; + +	for (zid = 0; zid <= ZONE_NORMAL; zid++) { +		struct zone *zone = &pgdat->node_zones[zid]; + +		if (zone_intersects(zone, start_pfn, nr_pages)) +			return zone;  	} -	if (target < idx) { -		/* pages must be at beginning of current zone */ -		if (pfn != zone->zone_start_pfn) -			return false; +	return &pgdat->node_zones[ZONE_NORMAL]; +} -		/* no zones in use between current zone and target */ -		for (i = target + 1; i < idx; i++) -			if (zone_is_initialized(zone - idx + i)) -				return false; +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. 
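
Since the zone is now picked at online time by allow_online_pfn_range() above and move_pfn_range() below, the userspace side of the decision reduces to which keyword is written to a memory block's state file: online_kernel, online_movable, or plain online for the default. A sketch against the usual /sys/devices/system/memory layout (block number and optional state passed on the command line):

/*
 * Userspace sketch: online one memory block into the requested zone.
 * "online_movable" targets ZONE_MOVABLE, "online_kernel" a kernel zone,
 * plain "online" lets the kernel pick the default.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
        char path[128];
        FILE *f;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <memory-block-nr> [state]\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path),
                 "/sys/devices/system/memory/memory%s/state", argv[1]);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "%s\n", argc > 2 ? argv[2] : "online_movable");
        return fclose(f) ? 1 : 0;
}

The block's valid_zones attribute can be read first to see which of those requests the kernel will accept.
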
+ */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, +		unsigned long start_pfn, unsigned long nr_pages) +{ +	struct pglist_data *pgdat = NODE_DATA(nid); +	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); + +	if (online_type == MMOP_ONLINE_KEEP) { +		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; +		/* +		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use +		 * movable zone if that is not possible (e.g. we are within +		 * or past the existing movable zone) +		 */ +		if (!allow_online_pfn_range(nid, start_pfn, nr_pages, +					MMOP_ONLINE_KERNEL)) +			zone = movable_zone; +	} else if (online_type == MMOP_ONLINE_MOVABLE) { +		zone = &pgdat->node_zones[ZONE_MOVABLE];  	} -	*zone_shift = target - idx; -	return true; +	move_pfn_range_to_zone(zone, start_pfn, nr_pages); +	return zone;  }  /* Must be protected by mem_hotplug_begin() */ @@ -1085,38 +972,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ  	int nid;  	int ret;  	struct memory_notify arg; -	int zone_shift = 0; -	/* -	 * This doesn't need a lock to do pfn_to_page(). -	 * The section can't be removed here because of the -	 * memory_block->state_mutex. -	 */ -	zone = page_zone(pfn_to_page(pfn)); - -	if ((zone_idx(zone) > ZONE_NORMAL || -	    online_type == MMOP_ONLINE_MOVABLE) && -	    !can_online_high_movable(zone)) +	nid = pfn_to_nid(pfn); +	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))  		return -EINVAL; -	if (online_type == MMOP_ONLINE_KERNEL) { -		if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) -			return -EINVAL; -	} else if (online_type == MMOP_ONLINE_MOVABLE) { -		if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) -			return -EINVAL; -	} - -	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); -	if (!zone) -		return -EINVAL; +	/* associate pfn range with the zone */ +	zone = move_pfn_range(online_type, nid, pfn, nr_pages);  	arg.start_pfn = pfn;  	arg.nr_pages = nr_pages;  	node_states_check_changes_online(nr_pages, zone, &arg); -	nid = zone_to_nid(zone); -  	ret = memory_notify(MEM_GOING_ONLINE, &arg);  	ret = notifier_to_errno(ret);  	if (ret) @@ -1311,39 +1178,6 @@ static int check_hotplug_memory_range(u64 start, u64 size)  	return 0;  } -/* - * If movable zone has already been setup, newly added memory should be check. - * If its address is higher than movable zone, it should be added as movable. - * Without this check, movable zone may overlap with other zone. 
- */ -static int should_add_memory_movable(int nid, u64 start, u64 size) -{ -	unsigned long start_pfn = start >> PAGE_SHIFT; -	pg_data_t *pgdat = NODE_DATA(nid); -	struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; - -	if (zone_is_empty(movable_zone)) -		return 0; - -	if (movable_zone->zone_start_pfn <= start_pfn) -		return 1; - -	return 0; -} - -int zone_for_memory(int nid, u64 start, u64 size, int zone_default, -		bool for_device) -{ -#ifdef CONFIG_ZONE_DEVICE -	if (for_device) -		return ZONE_DEVICE; -#endif -	if (should_add_memory_movable(nid, start, size)) -		return ZONE_MOVABLE; - -	return zone_default; -} -  static int online_memory_block(struct memory_block *mem, void *arg)  {  	return device_online(&mem->dev); @@ -1389,7 +1223,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)  	}  	/* call arch's memory hotadd */ -	ret = arch_add_memory(nid, start, size, false); +	ret = arch_add_memory(nid, start, size, true);  	if (ret < 0)  		goto error; @@ -1398,7 +1232,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)  	node_set_online(nid);  	if (new_node) { -		ret = register_one_node(nid); +		unsigned long start_pfn = start >> PAGE_SHIFT; +		unsigned long nr_pages = size >> PAGE_SHIFT; + +		ret = __register_one_node(nid); +		if (ret) +			goto register_fail; + +		/* +		 * link memory sections under this node. This is already +		 * done when creatig memory section in register_new_memory +		 * but that depends to have the node registered so offline +		 * nodes have to go through register_node. +		 * TODO clean up this mess. +		 */ +		ret = link_mem_sections(nid, start_pfn, nr_pages); +register_fail:  		/*  		 * If sysfs file of new node can't create, cpu on the node  		 * can't be hot-added. There is no rollback way now. @@ -1592,11 +1441,9 @@ static struct page *new_node_page(struct page *page, unsigned long private,  		gfp_mask |= __GFP_HIGHMEM;  	if (!nodes_empty(nmask)) -		new_page = __alloc_pages_nodemask(gfp_mask, 0, -					node_zonelist(nid, gfp_mask), &nmask); +		new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask);  	if (!new_page) -		new_page = __alloc_pages(gfp_mask, 0, -					node_zonelist(nid, gfp_mask)); +		new_page = __alloc_pages(gfp_mask, 0, nid);  	return new_page;  } @@ -1725,47 +1572,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)  	return offlined;  } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have - * normal memory. - */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ -	return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure the node has NORMAL memory if it is still online */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ -	struct pglist_data *pgdat = zone->zone_pgdat; -	unsigned long present_pages = 0; -	enum zone_type zt; - -	for (zt = 0; zt <= ZONE_NORMAL; zt++) -		present_pages += pgdat->node_zones[zt].present_pages; - -	if (present_pages > nr_pages) -		return true; - -	present_pages = 0; -	for (; zt <= ZONE_MOVABLE; zt++) -		present_pages += pgdat->node_zones[zt].present_pages; - -	/* -	 * we can't offline the last normal memory until all -	 * higher memory is offlined. 
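
On the removal side, with can_offline_normal() gone the outcome of an offline request is decided purely by page isolation and migration at offline time, so userspace simply asks and checks the result. A companion sketch to the online example above (same sysfs layout assumed, error handling kept minimal):

/*
 * Userspace sketch: check the block's removable hint, then try to
 * offline it; the kernel migrates the pages away or fails the write.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
        char base[128], path[192];
        int removable = 0;
        FILE *f;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <memory-block-nr>\n", argv[0]);
                return 1;
        }
        snprintf(base, sizeof(base),
                 "/sys/devices/system/memory/memory%s", argv[1]);

        snprintf(path, sizeof(path), "%s/removable", base);
        f = fopen(path, "r");
        if (f) {
                if (fscanf(f, "%d", &removable) != 1)
                        removable = 0;
                fclose(f);
        }
        printf("block %s removable hint: %d\n", argv[1], removable);

        snprintf(path, sizeof(path), "%s/state", base);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return 1;
        }
        fprintf(f, "offline\n");
        if (fclose(f)) {        /* failure (typically EBUSY) surfaces here */
                perror("offline");
                return 1;
        }
        return 0;
}
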
-	 */ -	return present_pages == 0; -} -#endif /* CONFIG_MOVABLE_NODE */ -  static int __init cmdline_parse_movable_node(char *p)  { -#ifdef CONFIG_MOVABLE_NODE +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP  	movable_node_enabled = true;  #else -	pr_warn("movable_node option not supported\n"); +	pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");  #endif  	return 0;  } @@ -1887,9 +1699,6 @@ static int __ref __offline_pages(unsigned long start_pfn,  	node = zone_to_nid(zone);  	nr_pages = end_pfn - start_pfn; -	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) -		return -EINVAL; -  	/* set above range as isolated */  	ret = start_isolate_page_range(start_pfn, end_pfn,  				       MIGRATE_MOVABLE, true); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..7d8e56214ac0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)  static const struct mempolicy_operations {  	int (*create)(struct mempolicy *pol, const nodemask_t *nodes); -	/* -	 * If read-side task has no lock to protect task->mempolicy, write-side -	 * task will rebind the task->mempolicy by two step. The first step is -	 * setting all the newly nodes, and the second step is cleaning all the -	 * disallowed nodes. In this way, we can avoid finding no node to alloc -	 * page. -	 * If we have a lock to protect task->mempolicy in read-side, we do -	 * rebind directly. -	 * -	 * step: -	 * 	MPOL_REBIND_ONCE - do rebind work at once -	 * 	MPOL_REBIND_STEP1 - set all the newly nodes -	 * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes -	 */ -	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, -			enum mpol_rebind_step step); +	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);  } mpol_ops[MPOL_MAX];  static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p)  	kmem_cache_free(policy_cache, p);  } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, -				enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)  {  } -/* - * step: - * 	MPOL_REBIND_ONCE  - do rebind work at once - * 	MPOL_REBIND_STEP1 - set all the newly nodes - * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, -				 enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)  {  	nodemask_t tmp; @@ -325,41 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,  	else if (pol->flags & MPOL_F_RELATIVE_NODES)  		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);  	else { -		/* -		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the -		 * result -		 */ -		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { -			nodes_remap(tmp, pol->v.nodes, -					pol->w.cpuset_mems_allowed, *nodes); -			pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; -		} else if (step == MPOL_REBIND_STEP2) { -			tmp = pol->w.cpuset_mems_allowed; -			pol->w.cpuset_mems_allowed = *nodes; -		} else -			BUG(); +		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, +								*nodes); +		pol->w.cpuset_mems_allowed = tmp;  	}  	if (nodes_empty(tmp))  		tmp = *nodes; -	if (step == MPOL_REBIND_STEP1) -		nodes_or(pol->v.nodes, pol->v.nodes, tmp); -	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) -		pol->v.nodes = tmp; -	else -		BUG(); - -	if (!node_isset(current->il_next, tmp)) { -		current->il_next = next_node_in(current->il_next, tmp); -		if (current->il_next >= MAX_NUMNODES) -			current->il_next = numa_node_id(); -	} +	pol->v.nodes = tmp;  }  static void mpol_rebind_preferred(struct mempolicy *pol, -				  const nodemask_t *nodes, -				  enum mpol_rebind_step step) +						const nodemask_t *nodes)  {  	nodemask_t tmp; @@ -385,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol,  /*   * mpol_rebind_policy - Migrate a policy to a different set of nodes   * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * 	MPOL_REBIND_ONCE  - do rebind work at once - * 	MPOL_REBIND_STEP1 - set all the newly nodes - * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification.   */ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, -				enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)  {  	if (!pol)  		return; -	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && +	if (!mpol_store_user_nodemask(pol) &&  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))  		return; -	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) -		return; - -	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) -		BUG(); - -	if (step == MPOL_REBIND_STEP1) -		pol->flags |= MPOL_F_REBINDING; -	else if (step == MPOL_REBIND_STEP2) -		pol->flags &= ~MPOL_F_REBINDING; -	else if (step >= MPOL_REBIND_NSTEP) -		BUG(); - -	mpol_ops[pol->mode].rebind(pol, newmask, step); +	mpol_ops[pol->mode].rebind(pol, newmask);  }  /* @@ -430,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,   * Called with task's alloc_lock held.   
*/ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, -			enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)  { -	mpol_rebind_policy(tsk->mempolicy, new, step); +	mpol_rebind_policy(tsk->mempolicy, new);  }  /* @@ -448,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)  	down_write(&mm->mmap_sem);  	for (vma = mm->mmap; vma; vma = vma->vm_next) -		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); +		mpol_rebind_policy(vma->vm_policy, new);  	up_write(&mm->mmap_sem);  } @@ -812,9 +743,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,  	}  	old = current->mempolicy;  	current->mempolicy = new; -	if (new && new->mode == MPOL_INTERLEAVE && -	    nodes_weight(new->v.nodes)) -		current->il_next = first_node(new->v.nodes); +	if (new && new->mode == MPOL_INTERLEAVE) +		current->il_prev = MAX_NUMNODES-1;  	task_unlock(current);  	mpol_put(old);  	ret = 0; @@ -916,7 +846,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,  			*policy = err;  		} else if (pol == current->mempolicy &&  				pol->mode == MPOL_INTERLEAVE) { -			*policy = current->il_next; +			*policy = next_node_in(current->il_prev, pol->v.nodes);  		} else {  			err = -EINVAL;  			goto out; @@ -1676,9 +1606,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)  	return NULL;  } -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, -	int nd) +/* Return the node id preferred by the given mempolicy, or the given id */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, +								int nd)  {  	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))  		nd = policy->v.preferred_node; @@ -1691,20 +1621,19 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,  		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));  	} -	return node_zonelist(nd, gfp); +	return nd;  }  /* Do dynamic interleaving for a process */  static unsigned interleave_nodes(struct mempolicy *policy)  { -	unsigned nid, next; +	unsigned next;  	struct task_struct *me = current; -	nid = me->il_next; -	next = next_node_in(nid, policy->v.nodes); +	next = next_node_in(me->il_prev, policy->v.nodes);  	if (next < MAX_NUMNODES) -		me->il_next = next; -	return nid; +		me->il_prev = next; +	return next;  }  /* @@ -1799,38 +1728,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol,  #ifdef CONFIG_HUGETLBFS  /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * huge_node(@vma, @addr, @gfp_flags, @mpol)   * @vma: virtual memory area whose policy is sought   * @addr: address in @vma for shared policy lookup and interleave policy   * @gfp_flags: for requested zone   * @mpol: pointer to mempolicy pointer for reference counted mempolicy   * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask   * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer   * to the struct mempolicy for conditional unref after allocation.   * If the effective policy is 'BIND, returns a pointer to the mempolicy's   * @nodemask for filtering the zonelist.   
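
The interleave rework above replaces il_next with il_prev: the task records only the node it used last, and interleave_nodes() advances with next_node_in() at allocation time. A standalone round-robin model over a small bitmask (plain C with a toy next_node_in(), not the kernel's nodemask API):

/*
 * Round-robin over the set bits of a node mask, remembering only the
 * previously used node -- the policy interleave_nodes() implements
 * with il_prev. Toy model, not kernel code.
 */
#include <stdio.h>

#define MAX_NODES 8

static int next_node_in(int prev, unsigned int mask)
{
        int i;

        for (i = 1; i <= MAX_NODES; i++) {
                int nid = (prev + i) % MAX_NODES;

                if (mask & (1u << nid))
                        return nid;
        }
        return -1;              /* empty mask */
}

int main(void)
{
        unsigned int allowed = (1u << 0) | (1u << 2) | (1u << 5);
        int il_prev = MAX_NODES - 1;    /* mirrors il_prev = MAX_NUMNODES-1 */
        int i;

        for (i = 0; i < 6; i++) {
                int nid = next_node_in(il_prev, allowed);

                printf("allocation %d -> node %d\n", i, nid);
                il_prev = nid;
        }
        return 0;
}

Seeding il_prev with the last possible node, as do_set_mempolicy() now does, makes the first allocation wrap around to the first allowed node.
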
*   * Must be protected by read_mems_allowed_begin()   */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, -				gfp_t gfp_flags, struct mempolicy **mpol, -				nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, +				struct mempolicy **mpol, nodemask_t **nodemask)  { -	struct zonelist *zl; +	int nid;  	*mpol = get_vma_policy(vma, addr);  	*nodemask = NULL;	/* assume !MPOL_BIND */  	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { -		zl = node_zonelist(interleave_nid(*mpol, vma, addr, -				huge_page_shift(hstate_vma(vma))), gfp_flags); +		nid = interleave_nid(*mpol, vma, addr, +					huge_page_shift(hstate_vma(vma)));  	} else { -		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); +		nid = policy_node(gfp_flags, *mpol, numa_node_id());  		if ((*mpol)->mode == MPOL_BIND)  			*nodemask = &(*mpol)->v.nodes;  	} -	return zl; +	return nid;  }  /* @@ -1932,12 +1860,10 @@ out:  static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,  					unsigned nid)  { -	struct zonelist *zl;  	struct page *page; -	zl = node_zonelist(nid, gfp); -	page = __alloc_pages(gfp, order, zl); -	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) +	page = __alloc_pages(gfp, order, nid); +	if (page && page_to_nid(page) == nid)  		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);  	return page;  } @@ -1971,13 +1897,10 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,  {  	struct mempolicy *pol;  	struct page *page; -	unsigned int cpuset_mems_cookie; -	struct zonelist *zl; +	int preferred_nid;  	nodemask_t *nmask; -retry_cpuset:  	pol = get_vma_policy(vma, addr); -	cpuset_mems_cookie = read_mems_allowed_begin();  	if (pol->mode == MPOL_INTERLEAVE) {  		unsigned nid; @@ -2015,12 +1938,10 @@ retry_cpuset:  	}  	nmask = policy_nodemask(gfp, pol); -	zl = policy_zonelist(gfp, pol, node); -	page = __alloc_pages_nodemask(gfp, order, zl, nmask); +	preferred_nid = policy_node(gfp, pol, node); +	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);  	mpol_cond_put(pol);  out: -	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) -		goto retry_cpuset;  	return page;  } @@ -2038,23 +1959,15 @@ out:   *	Allocate a page from the kernel page pool.  When not in   *	interrupt context and apply the current process NUMA policy.   *	Returns NULL when no page can be allocated. - * - *	Don't call cpuset_update_task_memory_state() unless - *	1) it's ok to take cpuset_sem (can WAIT), and - *	2) allocating for current task (not interrupt).   
*/  struct page *alloc_pages_current(gfp_t gfp, unsigned order)  {  	struct mempolicy *pol = &default_policy;  	struct page *page; -	unsigned int cpuset_mems_cookie;  	if (!in_interrupt() && !(gfp & __GFP_THISNODE))  		pol = get_task_policy(current); -retry_cpuset: -	cpuset_mems_cookie = read_mems_allowed_begin(); -  	/*  	 * No reference counting needed for current->mempolicy  	 * nor system default_policy @@ -2063,12 +1976,9 @@ retry_cpuset:  		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));  	else  		page = __alloc_pages_nodemask(gfp, order, -				policy_zonelist(gfp, pol, numa_node_id()), +				policy_node(gfp, pol, numa_node_id()),  				policy_nodemask(gfp, pol)); -	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) -		goto retry_cpuset; -  	return page;  }  EXPORT_SYMBOL(alloc_pages_current); @@ -2112,10 +2022,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)  	if (current_cpuset_is_being_rebound()) {  		nodemask_t mems = cpuset_mems_allowed(current); -		if (new->flags & MPOL_F_REBINDING) -			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); -		else -			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); +		mpol_rebind_policy(new, &mems);  	}  	atomic_set(&new->refcnt, 1);  	return new; diff --git a/mm/mempool.c b/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)  {  	void *element;  	unsigned long flags; -	wait_queue_t wait; +	wait_queue_entry_t wait;  	gfp_t gfp_temp;  	VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/migrate.c b/mm/migrate.c index 89a0a1707f4c..051cc1555d36 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,  		if (is_write_migration_entry(entry))  			pte = maybe_mkwrite(pte, vma); +		flush_dcache_page(new);  #ifdef CONFIG_HUGETLB_PAGE  		if (PageHuge(new)) {  			pte = pte_mkhuge(pte);  			pte = arch_make_huge_pte(pte, vma, new, 0); -		} -#endif -		flush_dcache_page(new); -		set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - -		if (PageHuge(new)) { +			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);  			if (PageAnon(new))  				hugepage_add_anon_rmap(new, vma, pvmw.address);  			else  				page_dup_rmap(new, true); -		} else if (PageAnon(new)) -			page_add_anon_rmap(new, vma, pvmw.address, false); -		else -			page_add_file_rmap(new, false); +		} else +#endif +		{ +			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); +			if (PageAnon(new)) +				page_add_anon_rmap(new, vma, pvmw.address, false); +			else +				page_add_file_rmap(new, false); +		}  		if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))  			mlock_vma_page(new); diff --git a/mm/mmap.c b/mm/mmap.c index f82741e199c0..5a0ba9788cdd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm,   *								w: (no) no   *								x: (yes) yes   */ -pgprot_t protection_map[16] = { +pgprot_t protection_map[16] __ro_after_init = {  	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,  	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111  }; @@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)  	unsigned long retval;  	unsigned long newbrk, oldbrk;  	struct mm_struct *mm = current->mm; +	struct vm_area_struct *next;  	unsigned long min_brk;  	bool populate;  	LIST_HEAD(uf); @@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)  	}  	/* Check against existing mmap 
mappings. */ -	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) +	next = find_vma(mm, oldbrk); +	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))  		goto out;  	/* Ok, looks good - let it rip. */ @@ -253,10 +255,22 @@ out:  static long vma_compute_subtree_gap(struct vm_area_struct *vma)  { -	unsigned long max, subtree_gap; -	max = vma->vm_start; -	if (vma->vm_prev) -		max -= vma->vm_prev->vm_end; +	unsigned long max, prev_end, subtree_gap; + +	/* +	 * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we +	 * allow two stack_guard_gaps between them here, and when choosing +	 * an unmapped area; whereas when expanding we only require one. +	 * That's a little inconsistent, but keeps the code here simpler. +	 */ +	max = vm_start_gap(vma); +	if (vma->vm_prev) { +		prev_end = vm_end_gap(vma->vm_prev); +		if (max > prev_end) +			max -= prev_end; +		else +			max = 0; +	}  	if (vma->vm_rb.rb_left) {  		subtree_gap = rb_entry(vma->vm_rb.rb_left,  				struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm)  			anon_vma_unlock_read(anon_vma);  		} -		highest_address = vma->vm_end; +		highest_address = vm_end_gap(vma);  		vma = vma->vm_next;  		i++;  	} @@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,  	if (vma->vm_next)  		vma_gap_update(vma->vm_next);  	else -		mm->highest_vm_end = vma->vm_end; +		mm->highest_vm_end = vm_end_gap(vma);  	/*  	 * vma->vm_prev wasn't known when we followed the rbtree to find the @@ -856,7 +870,7 @@ again:  			vma_gap_update(vma);  		if (end_changed) {  			if (!next) -				mm->highest_vm_end = end; +				mm->highest_vm_end = vm_end_gap(vma);  			else if (!adjust_next)  				vma_gap_update(next);  		} @@ -941,7 +955,7 @@ again:  			 * mm->highest_vm_end doesn't need any update  			 * in remove_next == 1 case.  			 */ -			VM_WARN_ON(mm->highest_vm_end != end); +			VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));  		}  	}  	if (insert && file) @@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)  	while (true) {  		/* Visit left subtree if it looks promising */ -		gap_end = vma->vm_start; +		gap_end = vm_start_gap(vma);  		if (gap_end >= low_limit && vma->vm_rb.rb_left) {  			struct vm_area_struct *left =  				rb_entry(vma->vm_rb.rb_left, @@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)  			}  		} -		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; +		gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;  check_current:  		/* Check if current node has a suitable gap */  		if (gap_start > high_limit)  			return -ENOMEM; -		if (gap_end >= low_limit && gap_end - gap_start >= length) +		if (gap_end >= low_limit && +		    gap_end > gap_start && gap_end - gap_start >= length)  			goto found;  		/* Visit right subtree if it looks promising */ @@ -1825,8 +1840,8 @@ check_current:  			vma = rb_entry(rb_parent(prev),  				       struct vm_area_struct, vm_rb);  			if (prev == vma->vm_rb.rb_left) { -				gap_start = vma->vm_prev->vm_end; -				gap_end = vma->vm_start; +				gap_start = vm_end_gap(vma->vm_prev); +				gap_end = vm_start_gap(vma);  				goto check_current;  			}  		} @@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)  	while (true) {  		/* Visit right subtree if it looks promising */ -		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; +		gap_start = vma->vm_prev ? 
vm_end_gap(vma->vm_prev) : 0;  		if (gap_start <= high_limit && vma->vm_rb.rb_right) {  			struct vm_area_struct *right =  				rb_entry(vma->vm_rb.rb_right, @@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)  check_current:  		/* Check if current node has a suitable gap */ -		gap_end = vma->vm_start; +		gap_end = vm_start_gap(vma);  		if (gap_end < low_limit)  			return -ENOMEM; -		if (gap_start <= high_limit && gap_end - gap_start >= length) +		if (gap_start <= high_limit && +		    gap_end > gap_start && gap_end - gap_start >= length)  			goto found;  		/* Visit left subtree if it looks promising */ @@ -1929,7 +1945,7 @@ check_current:  				       struct vm_area_struct, vm_rb);  			if (prev == vma->vm_rb.rb_right) {  				gap_start = vma->vm_prev ? -					vma->vm_prev->vm_end : 0; +					vm_end_gap(vma->vm_prev) : 0;  				goto check_current;  			}  		} @@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,  		unsigned long len, unsigned long pgoff, unsigned long flags)  {  	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; +	struct vm_area_struct *vma, *prev;  	struct vm_unmapped_area_info info;  	if (len > TASK_SIZE - mmap_min_addr) @@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,  	if (addr) {  		addr = PAGE_ALIGN(addr); -		vma = find_vma(mm, addr); +		vma = find_vma_prev(mm, addr, &prev);  		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && -		    (!vma || addr + len <= vma->vm_start)) +		    (!vma || addr + len <= vm_start_gap(vma)) && +		    (!prev || addr >= vm_end_gap(prev)))  			return addr;  	} @@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  			  const unsigned long len, const unsigned long pgoff,  			  const unsigned long flags)  { -	struct vm_area_struct *vma; +	struct vm_area_struct *vma, *prev;  	struct mm_struct *mm = current->mm;  	unsigned long addr = addr0;  	struct vm_unmapped_area_info info; @@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,  	/* requesting a specific address */  	if (addr) {  		addr = PAGE_ALIGN(addr); -		vma = find_vma(mm, addr); +		vma = find_vma_prev(mm, addr, &prev);  		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && -				(!vma || addr + len <= vma->vm_start)) +				(!vma || addr + len <= vm_start_gap(vma)) && +				(!prev || addr >= vm_end_gap(prev)))  			return addr;  	} @@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,   * update accounting. This is shared with both the   * grow-up and grow-down cases.   
*/ -static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, +			     unsigned long size, unsigned long grow)  {  	struct mm_struct *mm = vma->vm_mm;  	struct rlimit *rlim = current->signal->rlim; -	unsigned long new_start, actual_size; +	unsigned long new_start;  	/* address space limit tests */  	if (!may_expand_vm(mm, vma->vm_flags, grow))  		return -ENOMEM;  	/* Stack limit test */ -	actual_size = size; -	if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) -		actual_size -= PAGE_SIZE; -	if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) +	if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))  		return -ENOMEM;  	/* mlock limit tests */ @@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns  int expand_upwards(struct vm_area_struct *vma, unsigned long address)  {  	struct mm_struct *mm = vma->vm_mm; +	struct vm_area_struct *next; +	unsigned long gap_addr;  	int error = 0;  	if (!(vma->vm_flags & VM_GROWSUP))  		return -EFAULT; -	/* Guard against wrapping around to address 0. */ -	if (address < PAGE_ALIGN(address+4)) -		address = PAGE_ALIGN(address+4); -	else +	/* Guard against exceeding limits of the address space. */ +	address &= PAGE_MASK; +	if (address >= TASK_SIZE)  		return -ENOMEM; +	address += PAGE_SIZE; + +	/* Enforce stack_guard_gap */ +	gap_addr = address + stack_guard_gap; + +	/* Guard against overflow */ +	if (gap_addr < address || gap_addr > TASK_SIZE) +		gap_addr = TASK_SIZE; + +	next = vma->vm_next; +	if (next && next->vm_start < gap_addr) { +		if (!(next->vm_flags & VM_GROWSUP)) +			return -ENOMEM; +		/* Check that both stack segments have the same anon_vma? */ +	}  	/* We must make sure the anon_vma is allocated. */  	if (unlikely(anon_vma_prepare(vma))) @@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)  				if (vma->vm_next)  					vma_gap_update(vma->vm_next);  				else -					mm->highest_vm_end = address; +					mm->highest_vm_end = vm_end_gap(vma);  				spin_unlock(&mm->page_table_lock);  				perf_event_mmap(vma); @@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma,  				   unsigned long address)  {  	struct mm_struct *mm = vma->vm_mm; +	struct vm_area_struct *prev; +	unsigned long gap_addr;  	int error;  	address &= PAGE_MASK; @@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma,  	if (error)  		return error; +	/* Enforce stack_guard_gap */ +	gap_addr = address - stack_guard_gap; +	if (gap_addr > address) +		return -ENOMEM; +	prev = vma->vm_prev; +	if (prev && prev->vm_end > gap_addr) { +		if (!(prev->vm_flags & VM_GROWSDOWN)) +			return -ENOMEM; +		/* Check that both stack segments have the same anon_vma? */ +	} +  	/* We must make sure the anon_vma is allocated. */  	if (unlikely(anon_vma_prepare(vma)))  		return -ENOMEM; @@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma,  	return error;  } -/* - * Note how expand_stack() refuses to expand the stack all the way to - * abut the next virtual mapping, *unless* that mapping itself is also - * a stack mapping. We want to leave room for a guard page, after all - * (the guard page itself is not added here, that is done by the - * actual page faulting logic) - * - * This matches the behavior of the guard page logic (see mm/memory.c: - * check_stack_guard_page()), which only allows the guard page to be - * removed under these circumstances. 
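Throughout the mm/mmap.c hunks above, the gap bookkeeping (rb_subtree_gap, unmapped_area(), the get_unmapped_area checks, highest_vm_end) switches from raw vma->vm_start/vm_end to vm_start_gap()/vm_end_gap(). Those helpers are added on the include/linux/mm.h side of the same series and therefore do not show up in this mm/-only diffstat; the sketch below only reflects what the callers here assume, namely that a growing mapping is treated as if it extended by stack_guard_gap on its growing side, clamped against address wraparound:

	/* Sketch of the header-side helpers assumed by the hunks above. */
	static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
	{
		unsigned long vm_start = vma->vm_start;

		if (vma->vm_flags & VM_GROWSDOWN) {
			vm_start -= stack_guard_gap;
			if (vm_start > vma->vm_start)	/* underflow */
				vm_start = 0;
		}
		return vm_start;
	}

	static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
	{
		unsigned long vm_end = vma->vm_end;

		if (vma->vm_flags & VM_GROWSUP) {
			vm_end += stack_guard_gap;
			if (vm_end < vma->vm_end)	/* overflow */
				vm_end = -PAGE_SIZE;
		}
		return vm_end;
	}

With the default of 256UL << PAGE_SHIFT this enforces a 1 MB gap on 4 kB-page systems, and booting with stack_guard_gap=1 restores roughly the old one-page guard.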
- */ +/* enforced gap between the expanding stack and other mappings. */ +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +static int __init cmdline_parse_stack_guard_gap(char *p) +{ +	unsigned long val; +	char *endptr; + +	val = simple_strtoul(p, &endptr, 10); +	if (!*endptr) +		stack_guard_gap = val << PAGE_SHIFT; + +	return 0; +} +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); +  #ifdef CONFIG_STACK_GROWSUP  int expand_stack(struct vm_area_struct *vma, unsigned long address)  { -	struct vm_area_struct *next; - -	address &= PAGE_MASK; -	next = vma->vm_next; -	if (next && next->vm_start == address + PAGE_SIZE) { -		if (!(next->vm_flags & VM_GROWSUP)) -			return -ENOMEM; -	}  	return expand_upwards(vma, address);  } @@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)  #else  int expand_stack(struct vm_area_struct *vma, unsigned long address)  { -	struct vm_area_struct *prev; - -	address &= PAGE_MASK; -	prev = vma->vm_prev; -	if (prev && prev->vm_end == address) { -		if (!(prev->vm_flags & VM_GROWSDOWN)) -			return -ENOMEM; -	}  	return expand_downwards(vma, address);  } @@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,  		vma->vm_prev = prev;  		vma_gap_update(vma);  	} else -		mm->highest_vm_end = prev ? prev->vm_end : 0; +		mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;  	tail_vma->vm_next = NULL;  	/* Kill the cache */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 8edd0d576254..1a8c9ca83e48 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  	 * reading.  	 */  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); -	if (!pte) -		return 0;  	/* Get target node for single threaded private VMAs */  	if (prot_numa && !(vma->vm_flags & VM_SHARED) && diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 487dad610731..36454d0f96ee 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start,  	unsigned long end_pfn = min_t(unsigned long,  				      PFN_DOWN(end), max_low_pfn); -	if (start_pfn > end_pfn) +	if (start_pfn >= end_pfn)  		return 0;  	__free_pages_memory(start_pfn, end_pfn); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04c9143a8625..0e2c925e7826 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message)  	/* Get a reference to safely compare mm after task_unlock(victim) */  	mm = victim->mm;  	mmgrab(mm); + +	/* Raise event before sending signal: task reaper must see this */ +	count_vm_event(OOM_KILL); +	count_memcg_event_mm(mm, OOM_KILL); +  	/*  	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent  	 * the OOM victim from depleting the memory reserves from the user diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 143c1c25d680..8989eada0ef7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2433,8 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)  		inode_attach_wb(inode, page);  		wb = inode_to_wb(inode); -		inc_memcg_page_state(page, NR_FILE_DIRTY); -		__inc_node_page_state(page, NR_FILE_DIRTY); +		__inc_lruvec_page_state(page, NR_FILE_DIRTY);  		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);  		__inc_node_page_state(page, NR_DIRTIED);  		__inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2455,8 +2454,7 @@ void account_page_cleaned(struct page *page, struct 
address_space *mapping,  			  struct bdi_writeback *wb)  {  	if (mapping_cap_account_dirty(mapping)) { -		dec_memcg_page_state(page, NR_FILE_DIRTY); -		dec_node_page_state(page, NR_FILE_DIRTY); +		dec_lruvec_page_state(page, NR_FILE_DIRTY);  		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);  		dec_wb_stat(wb, WB_RECLAIMABLE);  		task_io_account_cancelled_write(PAGE_SIZE); @@ -2712,8 +2710,7 @@ int clear_page_dirty_for_io(struct page *page)  		 */  		wb = unlocked_inode_to_wb_begin(inode, &locked);  		if (TestClearPageDirty(page)) { -			dec_memcg_page_state(page, NR_FILE_DIRTY); -			dec_node_page_state(page, NR_FILE_DIRTY); +			dec_lruvec_page_state(page, NR_FILE_DIRTY);  			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);  			dec_wb_stat(wb, WB_RECLAIMABLE);  			ret = 1; @@ -2759,8 +2756,7 @@ int test_clear_page_writeback(struct page *page)  		ret = TestClearPageWriteback(page);  	}  	if (ret) { -		dec_memcg_page_state(page, NR_WRITEBACK); -		dec_node_page_state(page, NR_WRITEBACK); +		dec_lruvec_page_state(page, NR_WRITEBACK);  		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);  		inc_node_page_state(page, NR_WRITTEN);  	} @@ -2814,8 +2810,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)  		ret = TestSetPageWriteback(page);  	}  	if (!ret) { -		inc_memcg_page_state(page, NR_WRITEBACK); -		inc_node_page_state(page, NR_WRITEBACK); +		inc_lruvec_page_state(page, NR_WRITEBACK);  		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);  	}  	unlock_page_memcg(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2302f250d6b1..bd65b60939b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {  #ifdef CONFIG_HIGHMEM  	[N_HIGH_MEMORY] = { { [0] = 1UL } },  #endif -#ifdef CONFIG_MOVABLE_NODE  	[N_MEMORY] = { { [0] = 1UL } }, -#endif  	[N_CPU] = { { [0] = 1UL } },  #endif	/* NUMA */  }; @@ -511,7 +509,7 @@ static int page_is_consistent(struct zone *zone, struct page *page)  /*   * Temporary debugging check for pages not lying within a given zone.   
*/ -static int bad_range(struct zone *zone, struct page *page) +static int __maybe_unused bad_range(struct zone *zone, struct page *page)  {  	if (page_outside_zone_boundaries(zone, page))  		return 1; @@ -521,7 +519,7 @@ static int bad_range(struct zone *zone, struct page *page)  	return 0;  }  #else -static inline int bad_range(struct zone *zone, struct page *page) +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)  {  	return 0;  } @@ -1297,8 +1295,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn)  #endif  #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, -					struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, +		   struct mminit_pfnnid_cache *state)  {  	int nid; @@ -1320,8 +1319,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)  {  	return true;  } -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, -					struct mminit_pfnnid_cache *state) +static inline bool __meminit  __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, +		   struct mminit_pfnnid_cache *state)  {  	return true;  } @@ -1365,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,  	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))  		return NULL; -	start_page = pfn_to_page(start_pfn); +	start_page = pfn_to_online_page(start_pfn); +	if (!start_page) +		return NULL;  	if (page_zone(start_page) != zone)  		return NULL; @@ -3673,6 +3675,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,  	return false;  } +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ +	/* +	 * It's possible that cpuset's mems_allowed and the nodemask from +	 * mempolicy don't intersect. This should be normally dealt with by +	 * policy_nodemask(), but it's possible to race with cpuset update in +	 * such a way the check therein was true, and then it became false +	 * before we got our cpuset_mems_cookie here. +	 * This assumes that for all allocations, ac->nodemask can come only +	 * from MPOL_BIND mempolicy (whose documented semantics is to be ignored +	 * when it does not intersect with the cpuset restrictions) or the +	 * caller can deal with a violated nodemask. +	 */ +	if (cpusets_enabled() && ac->nodemask && +			!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { +		ac->nodemask = NULL; +		return true; +	} + +	/* +	 * When updating a task's mems_allowed or mempolicy nodemask, it is +	 * possible to race with parallel threads in such a way that our +	 * allocation can fail while the mask is being updated. If we are about +	 * to fail, check if the cpuset changed during allocation and if so, +	 * retry. +	 */ +	if (read_mems_allowed_retry(cpuset_mems_cookie)) +		return true; + +	return false; +} +  static inline struct page *  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,  						struct alloc_context *ac) @@ -3868,11 +3903,9 @@ retry:  				&compaction_retries))  		goto retry; -	/* -	 * It's possible we raced with cpuset update so the OOM would be -	 * premature (see below the nopage: label for full explanation). 
-	 */ -	if (read_mems_allowed_retry(cpuset_mems_cookie)) + +	/* Deal with possible cpuset update races before we start OOM killing */ +	if (check_retry_cpuset(cpuset_mems_cookie, ac))  		goto retry_cpuset;  	/* Reclaim has failed us, start killing things */ @@ -3893,14 +3926,8 @@ retry:  	}  nopage: -	/* -	 * When updating a task's mems_allowed or mempolicy nodemask, it is -	 * possible to race with parallel threads in such a way that our -	 * allocation can fail while the mask is being updated. If we are about -	 * to fail, check if the cpuset changed during allocation and if so, -	 * retry. -	 */ -	if (read_mems_allowed_retry(cpuset_mems_cookie)) +	/* Deal with possible cpuset update races before we fail */ +	if (check_retry_cpuset(cpuset_mems_cookie, ac))  		goto retry_cpuset;  	/* @@ -3951,12 +3978,12 @@ got_pg:  }  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, -		struct zonelist *zonelist, nodemask_t *nodemask, +		int preferred_nid, nodemask_t *nodemask,  		struct alloc_context *ac, gfp_t *alloc_mask,  		unsigned int *alloc_flags)  {  	ac->high_zoneidx = gfp_zone(gfp_mask); -	ac->zonelist = zonelist; +	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);  	ac->nodemask = nodemask;  	ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4001,8 +4028,8 @@ static inline void finalise_ac(gfp_t gfp_mask,   * This is the 'heart' of the zoned buddy allocator.   */  struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, -			struct zonelist *zonelist, nodemask_t *nodemask) +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, +							nodemask_t *nodemask)  {  	struct page *page;  	unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4010,7 +4037,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,  	struct alloc_context ac = { };  	gfp_mask &= gfp_allowed_mask; -	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) +	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))  		return NULL;  	finalise_ac(gfp_mask, order, &ac); @@ -4614,8 +4641,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)  			" present:%lukB"  			" managed:%lukB"  			" mlocked:%lukB" -			" slab_reclaimable:%lukB" -			" slab_unreclaimable:%lukB"  			" kernel_stack:%lukB"  			" pagetables:%lukB"  			" bounce:%lukB" @@ -4637,8 +4662,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)  			K(zone->present_pages),  			K(zone->managed_pages),  			K(zone_page_state(zone, NR_MLOCK)), -			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), -			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),  			zone_page_state(zone, NR_KERNEL_STACK_KB),  			K(zone_page_state(zone, NR_PAGETABLE)),  			K(zone_page_state(zone, NR_BOUNCE)), @@ -5124,6 +5147,7 @@ static void build_zonelists(pg_data_t *pgdat)   */  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);  static void setup_zone_pageset(struct zone *zone);  /* @@ -5528,7 +5552,7 @@ static __meminit void zone_pcp_init(struct zone *zone)  					 zone_batchsize(zone));  } -int __meminit init_currently_empty_zone(struct zone *zone, +void __meminit init_currently_empty_zone(struct zone *zone,  					unsigned long zone_start_pfn,  					unsigned long size)  { @@ -5546,8 +5570,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,  	
zone_init_free_lists(zone);  	zone->initialized = 1; - -	return 0;  }  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6005,7 +6027,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)  {  	enum zone_type j;  	int nid = pgdat->node_id; -	int ret;  	pgdat_resize_init(pgdat);  #ifdef CONFIG_NUMA_BALANCING @@ -6027,6 +6048,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)  	spin_lock_init(&pgdat->lru_lock);  	lruvec_init(node_lruvec(pgdat)); +	pgdat->per_cpu_nodestats = &boot_nodestats; +  	for (j = 0; j < MAX_NR_ZONES; j++) {  		struct zone *zone = pgdat->node_zones + j;  		unsigned long size, realsize, freesize, memmap_pages; @@ -6087,8 +6110,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)  		set_pageblock_order();  		setup_usemap(pgdat, zone, zone_start_pfn, size); -		ret = init_currently_empty_zone(zone, zone_start_pfn, size); -		BUG_ON(ret); +		init_currently_empty_zone(zone, zone_start_pfn, size);  		memmap_init(size, nid, j, zone_start_pfn);  	}  } @@ -7182,6 +7204,21 @@ static unsigned long __init arch_reserved_kernel_pages(void)  #endif  /* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + * Because 32-bit systems cannot have large physical memory, where this scaling + * makes sense, it is disabled on such platforms. + */ +#if __BITS_PER_LONG > 32 +#define ADAPT_SCALE_BASE	(64ul << 30) +#define ADAPT_SCALE_SHIFT	2 +#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT) +#endif + +/*   * allocate a large system hash table from bootmem   * - it is assumed that the hash table must contain an exact power-of-2   *   quantity of entries @@ -7200,6 +7237,7 @@ void *__init alloc_large_system_hash(const char *tablename,  	unsigned long long max = high_limit;  	unsigned long log2qty, size;  	void *table = NULL; +	gfp_t gfp_flags;  	/* allow the kernel cmdline to have a say */  	if (!numentries) { @@ -7211,6 +7249,16 @@ void *__init alloc_large_system_hash(const char *tablename,  		if (PAGE_SHIFT < 20)  			numentries = round_up(numentries, (1<<20)/PAGE_SIZE); +#if __BITS_PER_LONG > 32 +		if (!high_limit) { +			unsigned long adapt; + +			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; +			     adapt <<= ADAPT_SCALE_SHIFT) +				scale++; +		} +#endif +  		/* limit to 1 bucket per 2^scale bytes of low memory */  		if (scale > PAGE_SHIFT)  			numentries >>= (scale - PAGE_SHIFT); @@ -7244,12 +7292,17 @@ void *__init alloc_large_system_hash(const char *tablename,  	log2qty = ilog2(numentries); +	/* +	 * memblock allocator returns zeroed memory already, so HASH_ZERO is +	 * currently not used when HASH_EARLY is specified. +	 */ +	gfp_flags = (flags & HASH_ZERO) ? 
GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;  	do {  		size = bucketsize << log2qty;  		if (flags & HASH_EARLY)  			table = memblock_virt_alloc_nopanic(size, 0);  		else if (hashdist) -			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); +			table = __vmalloc(size, gfp_flags, PAGE_KERNEL);  		else {  			/*  			 * If bucketsize is not a power-of-two, we may free @@ -7257,8 +7310,8 @@ void *__init alloc_large_system_hash(const char *tablename,  			 * alloc_pages_exact() automatically does  			 */  			if (get_order(size) < MAX_ORDER) { -				table = alloc_pages_exact(size, GFP_ATOMIC); -				kmemleak_alloc(table, size, 1, GFP_ATOMIC); +				table = alloc_pages_exact(size, gfp_flags); +				kmemleak_alloc(table, size, 1, gfp_flags);  			}  		}  	} while (!table && size > PAGE_SIZE && --log2qty); @@ -7660,6 +7713,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)  			break;  	if (pfn == end_pfn)  		return; +	offline_mem_sections(pfn, end_pfn);  	zone = page_zone(pfn_to_page(pfn));  	spin_lock_irqsave(&zone->lock, flags);  	pfn = start_pfn; diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..2da71e627812 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio)  {  	struct page *page = bio->bi_io_vec[0].bv_page; -	if (bio->bi_error) { +	if (bio->bi_status) {  		SetPageError(page);  		/*  		 * We failed to write the page out to swap-space. @@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio)  {  	struct page *page = bio->bi_io_vec[0].bv_page; -	if (bio->bi_error) { +	if (bio->bi_status) {  		SetPageError(page);  		ClearPageUptodate(page);  		pr_alert("Read-error on swap-device (%u:%u:%llu)\n", diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5092e4ef00c8..3606104893e0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -138,12 +138,18 @@ static inline struct page *  __first_valid_page(unsigned long pfn, unsigned long nr_pages)  {  	int i; -	for (i = 0; i < nr_pages; i++) -		if (pfn_valid_within(pfn + i)) -			break; -	if (unlikely(i == nr_pages)) -		return NULL; -	return pfn_to_page(pfn + i); + +	for (i = 0; i < nr_pages; i++) { +		struct page *page; + +		if (!pfn_valid_within(pfn + i)) +			continue; +		page = pfn_to_online_page(pfn + i); +		if (!page) +			continue; +		return page; +	} +	return NULL;  }  /* @@ -184,8 +190,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,  undo:  	for (pfn = start_pfn;  	     pfn < undo_pfn; -	     pfn += pageblock_nr_pages) -		unset_migratetype_isolate(pfn_to_page(pfn), migratetype); +	     pfn += pageblock_nr_pages) { +		struct page *page = pfn_to_online_page(pfn); +		if (!page) +			continue; +		unset_migratetype_isolate(page, migratetype); +	}  	return -EBUSY;  } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)  	if (unlikely(PageHuge(pvmw->page))) {  		/* when pud is not present, pte will be NULL */ -		pvmw->pte = huge_pte_offset(mm, pvmw->address); +		pvmw->pte = huge_pte_offset(mm, pvmw->address, +					    PAGE_SIZE << compound_order(page));  		if (!pvmw->pte)  			return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,  	struct hstate *h = hstate_vma(vma);  	unsigned long 
next;  	unsigned long hmask = huge_page_mask(h); +	unsigned long sz = huge_page_size(h);  	pte_t *pte;  	int err = 0;  	do {  		next = hugetlb_entry_end(h, addr, end); -		pte = huge_pte_offset(walk->mm, addr & hmask); +		pte = huge_pte_offset(walk->mm, addr & hmask, sz);  		if (pte && walk->hugetlb_entry)  			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);  		if (err) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h new file mode 100644 index 000000000000..cd2442e13d8f --- /dev/null +++ b/mm/percpu-internal.h @@ -0,0 +1,166 @@ +#ifndef _MM_PERCPU_INTERNAL_H +#define _MM_PERCPU_INTERNAL_H + +#include <linux/types.h> +#include <linux/percpu.h> + +struct pcpu_chunk { +#ifdef CONFIG_PERCPU_STATS +	int			nr_alloc;	/* # of allocations */ +	size_t			max_alloc_size; /* largest allocation size */ +#endif + +	struct list_head	list;		/* linked to pcpu_slot lists */ +	int			free_size;	/* free bytes in the chunk */ +	int			contig_hint;	/* max contiguous size hint */ +	void			*base_addr;	/* base address of this chunk */ + +	int			map_used;	/* # of map entries used before the sentry */ +	int			map_alloc;	/* # of map entries allocated */ +	int			*map;		/* allocation map */ +	struct list_head	map_extend_list;/* on pcpu_map_extend_chunks */ + +	void			*data;		/* chunk data */ +	int			first_free;	/* no free below this */ +	bool			immutable;	/* no [de]population allowed */ +	bool			has_reserved;	/* Indicates if chunk has reserved space +						   at the beginning. Reserved chunk will +						   contain reservation for static chunk. +						   Dynamic chunk will contain reservation +						   for static and reserved chunks. */ +	int			nr_populated;	/* # of populated pages */ +	unsigned long		populated[];	/* populated bitmap */ +}; + +extern spinlock_t pcpu_lock; + +extern struct list_head *pcpu_slot; +extern int pcpu_nr_slots; + +extern struct pcpu_chunk *pcpu_first_chunk; +extern struct pcpu_chunk *pcpu_reserved_chunk; + +#ifdef CONFIG_PERCPU_STATS + +#include <linux/spinlock.h> + +struct percpu_stats { +	u64 nr_alloc;		/* lifetime # of allocations */ +	u64 nr_dealloc;		/* lifetime # of deallocations */ +	u64 nr_cur_alloc;	/* current # of allocations */ +	u64 nr_max_alloc;	/* max # of live allocations */ +	u32 nr_chunks;		/* current # of live chunks */ +	u32 nr_max_chunks;	/* max # of live chunks */ +	size_t min_alloc_size;	/* min allocaiton size */ +	size_t max_alloc_size;	/* max allocation size */ +}; + +extern struct percpu_stats pcpu_stats; +extern struct pcpu_alloc_info pcpu_stats_ai; + +/* + * For debug purposes. We don't care about the flexible array. + */ +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ +	memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); + +	/* initialize min_alloc_size to unit_size */ +	pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; +} + +/* + * pcpu_stats_area_alloc - increment area allocation stats + * @chunk: the location of the area being allocated + * @size: size of area to allocate in bytes + * + * CONTEXT: + * pcpu_lock. 
+ */ +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ +	lockdep_assert_held(&pcpu_lock); + +	pcpu_stats.nr_alloc++; +	pcpu_stats.nr_cur_alloc++; +	pcpu_stats.nr_max_alloc = +		max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); +	pcpu_stats.min_alloc_size = +		min(pcpu_stats.min_alloc_size, size); +	pcpu_stats.max_alloc_size = +		max(pcpu_stats.max_alloc_size, size); + +	chunk->nr_alloc++; +	chunk->max_alloc_size = max(chunk->max_alloc_size, size); +} + +/* + * pcpu_stats_area_dealloc - decrement allocation stats + * @chunk: the location of the area being deallocated + * + * CONTEXT: + * pcpu_lock. + */ +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ +	lockdep_assert_held(&pcpu_lock); + +	pcpu_stats.nr_dealloc++; +	pcpu_stats.nr_cur_alloc--; + +	chunk->nr_alloc--; +} + +/* + * pcpu_stats_chunk_alloc - increment chunk stats + */ +static inline void pcpu_stats_chunk_alloc(void) +{ +	unsigned long flags; +	spin_lock_irqsave(&pcpu_lock, flags); + +	pcpu_stats.nr_chunks++; +	pcpu_stats.nr_max_chunks = +		max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); + +	spin_unlock_irqrestore(&pcpu_lock, flags); +} + +/* + * pcpu_stats_chunk_dealloc - decrement chunk stats + */ +static inline void pcpu_stats_chunk_dealloc(void) +{ +	unsigned long flags; +	spin_lock_irqsave(&pcpu_lock, flags); + +	pcpu_stats.nr_chunks--; + +	spin_unlock_irqrestore(&pcpu_lock, flags); +} + +#else + +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ +} + +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ +} + +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ +} + +static inline void pcpu_stats_chunk_alloc(void) +{ +} + +static inline void pcpu_stats_chunk_dealloc(void) +{ +} + +#endif /* !CONFIG_PERCPU_STATS */ + +#endif diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d66911ff42d9..eb58aa4c0997 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -72,6 +72,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void)  	pcpu_chunk_populated(chunk, 0, nr_pages);  	spin_unlock_irq(&pcpu_lock); +	pcpu_stats_chunk_alloc(); +	trace_percpu_create_chunk(chunk->base_addr); +  	return chunk;  } @@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)  {  	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; -	if (chunk && chunk->data) +	if (!chunk) +		return; + +	pcpu_stats_chunk_dealloc(); +	trace_percpu_destroy_chunk(chunk->base_addr); + +	if (chunk->data)  		__free_pages(chunk->data, order_base_2(nr_pages));  	pcpu_free_chunk(chunk);  } diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c new file mode 100644 index 000000000000..03524a56eeff --- /dev/null +++ b/mm/percpu-stats.c @@ -0,0 +1,222 @@ +/* + * mm/percpu-debug.c + * + * Copyright (C) 2017		Facebook Inc. + * Copyright (C) 2017		Dennis Zhou <[email protected]> + * + * This file is released under the GPLv2. + * + * Prints statistics about the percpu allocator and backing chunks. + */ +#include <linux/debugfs.h> +#include <linux/list.h> +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/sort.h> +#include <linux/vmalloc.h> + +#include "percpu-internal.h" + +#define P(X, Y) \ +	seq_printf(m, "  %-24s: %8lld\n", X, (long long int)Y) + +struct percpu_stats pcpu_stats; +struct pcpu_alloc_info pcpu_stats_ai; + +static int cmpint(const void *a, const void *b) +{ +	return *(int *)a - *(int *)b; +} + +/* + * Iterates over all chunks to find the max # of map entries used. 
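chunk_map_stats() below decodes pcpu_chunk->map[]: bit 0 of each entry flags the area that begins at (entry & ~1) as allocated, and the next entry supplies the end offset, which is where the signed sizes ("negative = free space, positive = allocated") come from. A userspace model with hypothetical offsets shows the same decoding:

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical chunk map; the last entry is the sentry */
		int map[] = { 0 | 1, 256, 320 | 1, 1024 | 1 };
		int map_used = 3;

		for (int i = 0; i < map_used; i++) {
			int start = map[i] & ~1;
			int end = map[i + 1] & ~1;

			printf("[%4d,%4d) %s, %d bytes\n", start, end,
			       (map[i] & 1) ? "allocated" : "free", end - start);
		}
		return 0;	/* [0,256) allocated, [256,320) free, [320,1024) allocated */
	}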
+ */ +static int find_max_map_used(void) +{ +	struct pcpu_chunk *chunk; +	int slot, max_map_used; + +	max_map_used = 0; +	for (slot = 0; slot < pcpu_nr_slots; slot++) +		list_for_each_entry(chunk, &pcpu_slot[slot], list) +			max_map_used = max(max_map_used, chunk->map_used); + +	return max_map_used; +} + +/* + * Prints out chunk state. Fragmentation is considered between + * the beginning of the chunk to the last allocation. + */ +static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, +			    void *buffer) +{ +	int i, s_index, last_alloc, alloc_sign, as_len; +	int *alloc_sizes, *p; +	/* statistics */ +	int sum_frag = 0, max_frag = 0; +	int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; + +	alloc_sizes = buffer; +	s_index = chunk->has_reserved ? 1 : 0; + +	/* find last allocation */ +	last_alloc = -1; +	for (i = chunk->map_used - 1; i >= s_index; i--) { +		if (chunk->map[i] & 1) { +			last_alloc = i; +			break; +		} +	} + +	/* if the chunk is not empty - ignoring reserve */ +	if (last_alloc >= s_index) { +		as_len = last_alloc + 1 - s_index; + +		/* +		 * Iterate through chunk map computing size info. +		 * The first bit is overloaded to be a used flag. +		 * negative = free space, positive = allocated +		 */ +		for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { +			alloc_sign = (*p & 1) ? 1 : -1; +			alloc_sizes[i] = alloc_sign * +				((p[1] & ~1) - (p[0] & ~1)); +		} + +		sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); + +		/* Iterate through the unallocated fragements. */ +		for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { +			sum_frag -= *p; +			max_frag = max(max_frag, -1 * (*p)); +		} + +		cur_min_alloc = alloc_sizes[i]; +		cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2]; +		cur_max_alloc = alloc_sizes[as_len - 1]; +	} + +	P("nr_alloc", chunk->nr_alloc); +	P("max_alloc_size", chunk->max_alloc_size); +	P("free_size", chunk->free_size); +	P("contig_hint", chunk->contig_hint); +	P("sum_frag", sum_frag); +	P("max_frag", max_frag); +	P("cur_min_alloc", cur_min_alloc); +	P("cur_med_alloc", cur_med_alloc); +	P("cur_max_alloc", cur_max_alloc); +	seq_putc(m, '\n'); +} + +static int percpu_stats_show(struct seq_file *m, void *v) +{ +	struct pcpu_chunk *chunk; +	int slot, max_map_used; +	void *buffer; + +alloc_buffer: +	spin_lock_irq(&pcpu_lock); +	max_map_used = find_max_map_used(); +	spin_unlock_irq(&pcpu_lock); + +	buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); +	if (!buffer) +		return -ENOMEM; + +	spin_lock_irq(&pcpu_lock); + +	/* if the buffer allocated earlier is too small */ +	if (max_map_used < find_max_map_used()) { +		spin_unlock_irq(&pcpu_lock); +		vfree(buffer); +		goto alloc_buffer; +	} + +#define PL(X) \ +	seq_printf(m, "  %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) + +	seq_printf(m, +			"Percpu Memory Statistics\n" +			"Allocation Info:\n" +			"----------------------------------------\n"); +	PL(unit_size); +	PL(static_size); +	PL(reserved_size); +	PL(dyn_size); +	PL(atom_size); +	PL(alloc_size); +	seq_putc(m, '\n'); + +#undef PL + +#define PU(X) \ +	seq_printf(m, "  %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) + +	seq_printf(m, +			"Global Stats:\n" +			"----------------------------------------\n"); +	PU(nr_alloc); +	PU(nr_dealloc); +	PU(nr_cur_alloc); +	PU(nr_max_alloc); +	PU(nr_chunks); +	PU(nr_max_chunks); +	PU(min_alloc_size); +	PU(max_alloc_size); +	seq_putc(m, '\n'); + +#undef PU + +	seq_printf(m, +			"Per Chunk Stats:\n" +			
"----------------------------------------\n"); + +	if (pcpu_reserved_chunk) { +		seq_puts(m, "Chunk: <- Reserved Chunk\n"); +		chunk_map_stats(m, pcpu_reserved_chunk, buffer); +	} + +	for (slot = 0; slot < pcpu_nr_slots; slot++) { +		list_for_each_entry(chunk, &pcpu_slot[slot], list) { +			if (chunk == pcpu_first_chunk) { +				seq_puts(m, "Chunk: <- First Chunk\n"); +				chunk_map_stats(m, chunk, buffer); + + +			} else { +				seq_puts(m, "Chunk:\n"); +				chunk_map_stats(m, chunk, buffer); +			} + +		} +	} + +	spin_unlock_irq(&pcpu_lock); + +	vfree(buffer); + +	return 0; +} + +static int percpu_stats_open(struct inode *inode, struct file *filp) +{ +	return single_open(filp, percpu_stats_show, NULL); +} + +static const struct file_operations percpu_stats_fops = { +	.open		= percpu_stats_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static int __init init_percpu_stats_debugfs(void) +{ +	debugfs_create_file("percpu_stats", 0444, NULL, NULL, +			&percpu_stats_fops); + +	return 0; +} + +late_initcall(init_percpu_stats_debugfs); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9ac639499bd1..15dab691ea70 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void)  	chunk->data = vms;  	chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; + +	pcpu_stats_chunk_alloc(); +	trace_percpu_create_chunk(chunk->base_addr); +  	return chunk;  }  static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)  { -	if (chunk && chunk->data) +	if (!chunk) +		return; + +	pcpu_stats_chunk_dealloc(); +	trace_percpu_destroy_chunk(chunk->base_addr); + +	if (chunk->data)  		pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);  	pcpu_free_chunk(chunk);  } diff --git a/mm/percpu.c b/mm/percpu.c index e0aa8ae7bde7..bd4130a69bbc 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,11 @@  #include <asm/tlbflush.h>  #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/percpu.h> + +#include "percpu-internal.h" +  #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */  #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */  #define PCPU_ATOMIC_MAP_MARGIN_LOW	32 @@ -103,53 +108,35 @@  #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)  #endif	/* CONFIG_SMP */ -struct pcpu_chunk { -	struct list_head	list;		/* linked to pcpu_slot lists */ -	int			free_size;	/* free bytes in the chunk */ -	int			contig_hint;	/* max contiguous size hint */ -	void			*base_addr;	/* base address of this chunk */ - -	int			map_used;	/* # of map entries used before the sentry */ -	int			map_alloc;	/* # of map entries allocated */ -	int			*map;		/* allocation map */ -	struct list_head	map_extend_list;/* on pcpu_map_extend_chunks */ - -	void			*data;		/* chunk data */ -	int			first_free;	/* no free below this */ -	bool			immutable;	/* no [de]population allowed */ -	int			nr_populated;	/* # of populated pages */ -	unsigned long		populated[];	/* populated bitmap */ -}; - -static int pcpu_unit_pages __read_mostly; -static int pcpu_unit_size __read_mostly; -static int pcpu_nr_units __read_mostly; -static int pcpu_atom_size __read_mostly; -static int pcpu_nr_slots __read_mostly; -static size_t pcpu_chunk_struct_size __read_mostly; +static int pcpu_unit_pages __ro_after_init; +static int pcpu_unit_size __ro_after_init; +static int pcpu_nr_units __ro_after_init; +static int pcpu_atom_size __ro_after_init; +int pcpu_nr_slots __ro_after_init; +static size_t pcpu_chunk_struct_size __ro_after_init;  /* cpus with the lowest 
and highest unit addresses */ -static unsigned int pcpu_low_unit_cpu __read_mostly; -static unsigned int pcpu_high_unit_cpu __read_mostly; +static unsigned int pcpu_low_unit_cpu __ro_after_init; +static unsigned int pcpu_high_unit_cpu __ro_after_init;  /* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr __read_mostly; +void *pcpu_base_addr __ro_after_init;  EXPORT_SYMBOL_GPL(pcpu_base_addr); -static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */ -const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */ +static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */  /* group information, used for vm allocation */ -static int pcpu_nr_groups __read_mostly; -static const unsigned long *pcpu_group_offsets __read_mostly; -static const size_t *pcpu_group_sizes __read_mostly; +static int pcpu_nr_groups __ro_after_init; +static const unsigned long *pcpu_group_offsets __ro_after_init; +static const size_t *pcpu_group_sizes __ro_after_init;  /*   * The first chunk which always exists.  Note that unlike other   * chunks, this one can be allocated and mapped in several different   * ways and thus often doesn't live in the vmalloc area.   */ -static struct pcpu_chunk *pcpu_first_chunk; +struct pcpu_chunk *pcpu_first_chunk __ro_after_init;  /*   * Optional reserved chunk.  This chunk reserves part of the first @@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk;   * area doesn't exist, the following variables contain NULL and 0   * respectively.   */ -static struct pcpu_chunk *pcpu_reserved_chunk; -static int pcpu_reserved_chunk_limit; +struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; +static int pcpu_reserved_chunk_limit __ro_after_init; -static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */ +DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */  static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */ -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */  /* chunks which need their map areas extended, protected by pcpu_lock */  static LIST_HEAD(pcpu_map_extend_chunks); @@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,  	int to_free = 0;  	int *p; +	lockdep_assert_held(&pcpu_lock); +	pcpu_stats_area_dealloc(chunk); +  	freeme |= 1;	/* we are searching for <given offset, in use> pair */  	i = 0; @@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)  	chunk->map[0] = 0;  	chunk->map[1] = pcpu_unit_size | 1;  	chunk->map_used = 1; +	chunk->has_reserved = false;  	INIT_LIST_HEAD(&chunk->list);  	INIT_LIST_HEAD(&chunk->map_extend_list); @@ -965,8 +956,10 @@ restart:  	 * tasks to create chunks simultaneously.  Serialize and create iff  	 * there's still no empty chunk after grabbing the mutex.  	 
*/ -	if (is_atomic) +	if (is_atomic) { +		err = "atomic alloc failed, no space left";  		goto fail; +	}  	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {  		chunk = pcpu_create_chunk(); @@ -984,6 +977,7 @@ restart:  	goto restart;  area_found: +	pcpu_stats_area_alloc(chunk, size);  	spin_unlock_irqrestore(&pcpu_lock, flags);  	/* populate if not all pages are already there */ @@ -1026,11 +1020,17 @@ area_found:  	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);  	kmemleak_alloc_percpu(ptr, size, gfp); + +	trace_percpu_alloc_percpu(reserved, is_atomic, size, align, +			chunk->base_addr, off, ptr); +  	return ptr;  fail_unlock:  	spin_unlock_irqrestore(&pcpu_lock, flags);  fail: +	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); +  	if (!is_atomic && warn_limit) {  		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",  			size, align, is_atomic, err); @@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr)  			}  	} +	trace_percpu_free_percpu(chunk->base_addr, off, ptr); +  	spin_unlock_irqrestore(&pcpu_lock, flags);  }  EXPORT_SYMBOL_GPL(free_percpu); @@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,  	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +  		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); +	pcpu_stats_save_ai(ai); +  	/*  	 * Allocate chunk slots.  The additional last slot is for  	 * empty chunks. @@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,  	if (schunk->free_size)  		schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;  	schunk->map[schunk->map_used] |= 1; +	schunk->has_reserved = true;  	/* init dynamic chunk if necessary */  	if (dyn_size) { @@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,  		dchunk->map[1] = pcpu_reserved_chunk_limit;  		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;  		dchunk->map_used = 2; +		dchunk->has_reserved = true;  	}  	/* link the first chunk in */ @@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,  		pcpu_count_occupied_pages(pcpu_first_chunk, 1);  	pcpu_chunk_relocate(pcpu_first_chunk, -1); +	pcpu_stats_chunk_alloc(); +	trace_percpu_create_chunk(base_addr); +  	/* we're done */  	pcpu_base_addr = base_addr;  	return 0; diff --git a/mm/rmap.c b/mm/rmap.c index d405f0e0ee96..ced14f1af6dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)  void try_to_unmap_flush(void)  {  	struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; -	int cpu;  	if (!tlb_ubc->flush_required)  		return; -	cpu = get_cpu(); - -	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { -		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); -		local_flush_tlb(); -		trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); -	} - -	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) -		flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); -	cpumask_clear(&tlb_ubc->cpumask); +	arch_tlbbatch_flush(&tlb_ubc->arch);  	tlb_ubc->flush_required = false;  	tlb_ubc->writable = false; -	put_cpu();  }  /* Flush iff there are potentially writable TLB entries that can race with IO */ @@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)  {  	struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; -	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); +	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);  	tlb_ubc->flush_required = true;  	/* @@ 
-1157,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound)  		if (!atomic_inc_and_test(&page->_mapcount))  			goto out;  	} -	__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); -	mod_memcg_page_state(page, NR_FILE_MAPPED, nr); +	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);  out:  	unlock_page_memcg(page);  } @@ -1193,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound)  	}  	/* -	 * We use the irq-unsafe __{inc|mod}_zone_page_state because +	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because  	 * these counters are not modified in interrupt context, and  	 * pte lock(a spinlock) is held, which implies preemption disabled.  	 */ -	__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); -	mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); +	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);  	if (unlikely(PageMlocked(page)))  		clear_page_mlock(page); @@ -1379,15 +1365,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,  		update_hiwater_rss(mm);  		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { +			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));  			if (PageHuge(page)) {  				int nr = 1 << compound_order(page);  				hugetlb_count_sub(nr, mm); +				set_huge_swap_pte_at(mm, address, +						     pvmw.pte, pteval, +						     vma_mmu_pagesize(vma));  			} else {  				dec_mm_counter(mm, mm_counter(page)); +				set_pte_at(mm, address, pvmw.pte, pteval);  			} -			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); -			set_pte_at(mm, address, pvmw.pte, pteval);  		} else if (pte_unused(pteval)) {  			/*  			 * The guest indicated that the page content is of no diff --git a/mm/shmem.c b/mm/shmem.c index e67d6ba4e98e..9418f5a9bc46 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt;  #include <uapi/linux/memfd.h>  #include <linux/userfaultfd_k.h>  #include <linux/rmap.h> +#include <linux/uuid.h>  #include <linux/uaccess.h>  #include <asm/pgtable.h> @@ -1290,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)  		SetPageUptodate(page);  	} -	swap = get_swap_page(); +	swap = get_swap_page(page);  	if (!swap.val)  		goto redirty; @@ -1326,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)  	mutex_unlock(&shmem_swaplist_mutex);  free_swap: -	swapcache_free(swap); +	put_swap_page(page, swap);  redirty:  	set_page_dirty(page);  	if (wbc->for_reclaim) @@ -1645,8 +1646,7 @@ repeat:  			if (fault_type) {  				*fault_type |= VM_FAULT_MAJOR;  				count_vm_event(PGMAJFAULT); -				mem_cgroup_count_vm_event(charge_mm, -							  PGMAJFAULT); +				count_memcg_event_mm(charge_mm, PGMAJFAULT);  			}  			/* Here we actually start the io */  			page = shmem_swapin(swap, gfp, info, index); @@ -1902,10 +1902,10 @@ unlock:   * entry unconditionally - even if something else had already woken the   * target.   
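The mempool.c and shmem.c hunks in this diff track a tree-wide rename of the wait-queue types that happens outside mm/: wait_queue_t becomes wait_queue_entry_t and its list linkage moves from ->task_list to ->entry (likewise the head's list becomes ->head, which is why shmem_falloc_waitq.task_list turns into shmem_falloc_waitq.head). Assuming the matching scheduler-side change, the renamed entry looks roughly like this:

	struct wait_queue_entry {
		unsigned int		flags;
		void			*private;
		wait_queue_func_t	func;
		struct list_head	entry;		/* formerly task_list */
	};
	typedef struct wait_queue_entry wait_queue_entry_t;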
*/ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)  {  	int ret = default_wake_function(wait, mode, sync, key); -	list_del_init(&wait->task_list); +	list_del_init(&wait->entry);  	return ret;  } @@ -2840,7 +2840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,  		spin_lock(&inode->i_lock);  		inode->i_private = NULL;  		wake_up_all(&shmem_falloc_waitq); -		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); +		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));  		spin_unlock(&inode->i_lock);  		error = 0;  		goto out; @@ -3761,6 +3761,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)  #ifdef CONFIG_TMPFS_POSIX_ACL  	sb->s_flags |= MS_POSIXACL;  #endif +	uuid_gen(&sb->s_uuid);  	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);  	if (!inode) diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..04dec48c3ed7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,  	nr_pages = (1 << cachep->gfporder);  	if (cachep->flags & SLAB_RECLAIM_ACCOUNT) -		add_zone_page_state(page_zone(page), -			NR_SLAB_RECLAIMABLE, nr_pages); +		mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);  	else -		add_zone_page_state(page_zone(page), -			NR_SLAB_UNRECLAIMABLE, nr_pages); +		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);  	__SetPageSlab(page);  	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ @@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)  	kmemcheck_free_shadow(page, order);  	if (cachep->flags & SLAB_RECLAIM_ACCOUNT) -		sub_zone_page_state(page_zone(page), -				NR_SLAB_RECLAIMABLE, nr_freed); +		mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);  	else -		sub_zone_page_state(page_zone(page), -				NR_SLAB_UNRECLAIMABLE, nr_freed); +		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed);  	BUG_ON(!PageSlab(page));  	__ClearPageSlabPfmemalloc(page); @@ -2040,17 +2036,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)  	 * unaligned accesses for some archs when redzoning is used, and makes  	 * sure any on-slab bufctl's are also correctly aligned.  	 */ -	if (size & (BYTES_PER_WORD - 1)) { -		size += (BYTES_PER_WORD - 1); -		size &= ~(BYTES_PER_WORD - 1); -	} +	size = ALIGN(size, BYTES_PER_WORD);  	if (flags & SLAB_RED_ZONE) {  		ralign = REDZONE_ALIGN;  		/* If redzoning, ensure that the second redzone is suitably  		 * aligned, by adjusting the object size accordingly. */ -		size += REDZONE_ALIGN - 1; -		size &= ~(REDZONE_ALIGN - 1); +		size = ALIGN(size, REDZONE_ALIGN);  	}  	/* 3) caller mandated alignment */ diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..6885e1192ec5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page,  					     gfp_t gfp, int order,  					     struct kmem_cache *s)  { -	int ret; -  	if (!memcg_kmem_enabled())  		return 0;  	if (is_root_cache(s))  		return 0; - -	ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); -	if (ret) -		return ret; - -	memcg_kmem_update_page_stat(page, -			(s->flags & SLAB_RECLAIM_ACCOUNT) ? 
-			MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, -			1 << order); -	return 0; +	return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);  }  static __always_inline void memcg_uncharge_slab(struct page *page, int order, @@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,  {  	if (!memcg_kmem_enabled())  		return; - -	memcg_kmem_update_page_stat(page, -			(s->flags & SLAB_RECLAIM_ACCOUNT) ? -			MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, -			-(1 << order));  	memcg_kmem_uncharge(page, order);  } diff --git a/mm/slab_common.c b/mm/slab_common.c index 01a0fe2eb332..904a83be82de 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,  /*   * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.)   */ -static int slab_nomerge; +static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);  static int __init setup_slab_nomerge(char *str)  { -	slab_nomerge = 1; +	slab_nomerge = true;  	return 1;  } diff --git a/mm/slub.c b/mm/slub.c index 7449593fca72..1d3f9835f4ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out:  	if (!page)  		return NULL; -	mod_zone_page_state(page_zone(page), +	mod_lruvec_page_state(page,  		(s->flags & SLAB_RECLAIM_ACCOUNT) ?  		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,  		1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)  	kmemcheck_free_shadow(page, compound_order(page)); -	mod_zone_page_state(page_zone(page), +	mod_lruvec_page_state(page,  		(s->flags & SLAB_RECLAIM_ACCOUNT) ?  		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,  		-pages); @@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,  			stat(s, CPU_PARTIAL_NODE);  		}  		if (!kmem_cache_has_cpu_partial(s) -			|| available > s->cpu_partial / 2) +			|| available > slub_cpu_partial(s) / 2)  			break;  	} @@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)   * Remove the cpu slab   */  static void deactivate_slab(struct kmem_cache *s, struct page *page, -				void *freelist) +				void *freelist, struct kmem_cache_cpu *c)  {  	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };  	struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2132,6 +2132,9 @@ redo:  		discard_slab(s, page);  		stat(s, FREE_SLAB);  	} + +	c->page = NULL; +	c->freelist = NULL;  }  /* @@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)  {  	stat(s, CPUSLAB_FLUSH); -	deactivate_slab(s, c->page, c->freelist); +	deactivate_slab(s, c->page, c->freelist, c);  	c->tid = next_tid(c->tid); -	c->page = NULL; -	c->freelist = NULL;  }  /* @@ -2302,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info)  	struct kmem_cache *s = info;  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); -	return c->page || c->partial; +	return c->page || slub_percpu_partial(c);  }  static void flush_all(struct kmem_cache *s) @@ -2521,9 +2522,7 @@ redo:  		if (unlikely(!node_match(page, searchnode))) {  			stat(s, ALLOC_NODE_MISMATCH); -			deactivate_slab(s, page, c->freelist); -			c->page = NULL; -			c->freelist = NULL; +			deactivate_slab(s, page, c->freelist, c);  			goto new_slab;  		}  	} @@ -2534,9 +2533,7 @@ redo:  	 * information when the 
page leaves the per-cpu allocator  	 */  	if (unlikely(!pfmemalloc_match(page, gfpflags))) { -		deactivate_slab(s, page, c->freelist); -		c->page = NULL; -		c->freelist = NULL; +		deactivate_slab(s, page, c->freelist, c);  		goto new_slab;  	} @@ -2568,11 +2565,10 @@ load_freelist:  new_slab: -	if (c->partial) { -		page = c->page = c->partial; -		c->partial = page->next; +	if (slub_percpu_partial(c)) { +		page = c->page = slub_percpu_partial(c); +		slub_set_percpu_partial(c, page);  		stat(s, CPU_PARTIAL_ALLOC); -		c->freelist = NULL;  		goto redo;  	} @@ -2592,9 +2588,7 @@ new_slab:  			!alloc_debug_processing(s, page, freelist, addr))  		goto new_slab;	/* Slab failed checks. Next slab needed */ -	deactivate_slab(s, page, get_freepointer(s, freelist)); -	c->page = NULL; -	c->freelist = NULL; +	deactivate_slab(s, page, get_freepointer(s, freelist), c);  	return freelist;  } @@ -3410,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)  	s->min_partial = min;  } +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL +	/* +	 * cpu_partial determined the maximum number of objects kept in the +	 * per cpu partial lists of a processor. +	 * +	 * Per cpu partial lists mainly contain slabs that just have one +	 * object freed. If they are used for allocation then they can be +	 * filled up again with minimal effort. The slab will never hit the +	 * per node partial lists and therefore no locking will be required. +	 * +	 * This setting also determines +	 * +	 * A) The number of objects from per cpu partial slabs dumped to the +	 *    per node list when we reach the limit. +	 * B) The number of objects in cpu partial slabs to extract from the +	 *    per node list when we run out of per cpu objects. We only fetch +	 *    50% to keep some capacity around for frees. +	 */ +	if (!kmem_cache_has_cpu_partial(s)) +		s->cpu_partial = 0; +	else if (s->size >= PAGE_SIZE) +		s->cpu_partial = 2; +	else if (s->size >= 1024) +		s->cpu_partial = 6; +	else if (s->size >= 256) +		s->cpu_partial = 13; +	else +		s->cpu_partial = 30; +#endif +} +  /*   * calculate_sizes() determines the order and the distribution of data within   * a slab object. @@ -3568,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)  	 */  	set_min_partial(s, ilog2(s->size) / 2); -	/* -	 * cpu_partial determined the maximum number of objects kept in the -	 * per cpu partial lists of a processor. -	 * -	 * Per cpu partial lists mainly contain slabs that just have one -	 * object freed. If they are used for allocation then they can be -	 * filled up again with minimal effort. The slab will never hit the -	 * per node partial lists and therefore no locking will be required. -	 * -	 * This setting also determines -	 * -	 * A) The number of objects from per cpu partial slabs dumped to the -	 *    per node list when we reach the limit. -	 * B) The number of objects in cpu partial slabs to extract from the -	 *    per node list when we run out of per cpu objects. We only fetch -	 *    50% to keep some capacity around for frees. 
-	 */ -	if (!kmem_cache_has_cpu_partial(s)) -		s->cpu_partial = 0; -	else if (s->size >= PAGE_SIZE) -		s->cpu_partial = 2; -	else if (s->size >= 1024) -		s->cpu_partial = 6; -	else if (s->size >= 256) -		s->cpu_partial = 13; -	else -		s->cpu_partial = 30; +	set_cpu_partial(s);  #ifdef CONFIG_NUMA  	s->remote_node_defrag_ratio = 1000; @@ -3981,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)  	 * Disable empty slabs caching. Used to avoid pinning offline  	 * memory cgroups by kmem pages that can be freed.  	 */ -	s->cpu_partial = 0; +	slub_set_cpu_partial(s, 0);  	s->min_partial = 0;  	/* @@ -4760,7 +4761,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,  			total += x;  			nodes[node] += x; -			page = READ_ONCE(c->partial); +			page = slub_percpu_partial_read_once(c);  			if (page) {  				node = page_to_nid(page);  				if (flags & SO_TOTAL) @@ -4921,7 +4922,7 @@ SLAB_ATTR(min_partial);  static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)  { -	return sprintf(buf, "%u\n", s->cpu_partial); +	return sprintf(buf, "%u\n", slub_cpu_partial(s));  }  static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -4936,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,  	if (objects && !kmem_cache_has_cpu_partial(s))  		return -EINVAL; -	s->cpu_partial = objects; +	slub_set_cpu_partial(s, objects);  	flush_all(s);  	return length;  } @@ -4988,7 +4989,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)  	int len;  	for_each_online_cpu(cpu) { -		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; +		struct page *page; + +		page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));  		if (page) {  			pages += page->pages; @@ -5000,7 +5003,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)  #ifdef CONFIG_SMP  	for_each_online_cpu(cpu) { -		struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; +		struct page *page; + +		page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));  		if (page && len < PAGE_SIZE - 20)  			len += sprintf(buf + len, " C%d=%d(%d)", cpu, @@ -5625,6 +5630,28 @@ static char *create_unique_id(struct kmem_cache *s)  	return name;  } +static void sysfs_slab_remove_workfn(struct work_struct *work) +{ +	struct kmem_cache *s = +		container_of(work, struct kmem_cache, kobj_remove_work); + +	if (!s->kobj.state_in_sysfs) +		/* +		 * For a memcg cache, this may be called during +		 * deactivation and again on shutdown.  Remove only once. +		 * A cache is never shut down before deactivation is +		 * complete, so no need to worry about synchronization. +		 */ +		return; + +#ifdef CONFIG_MEMCG +	kset_unregister(s->memcg_kset); +#endif +	kobject_uevent(&s->kobj, KOBJ_REMOVE); +	kobject_del(&s->kobj); +	kobject_put(&s->kobj); +} +  static int sysfs_slab_add(struct kmem_cache *s)  {  	int err; @@ -5632,6 +5659,8 @@ static int sysfs_slab_add(struct kmem_cache *s)  	struct kset *kset = cache_kset(s);  	int unmergeable = slab_unmergeable(s); +	INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn); +  	if (!kset) {  		kobject_init(&s->kobj, &slab_ktype);  		return 0; @@ -5695,20 +5724,8 @@ static void sysfs_slab_remove(struct kmem_cache *s)  		 */  		return; -	if (!s->kobj.state_in_sysfs) -		/* -		 * For a memcg cache, this may be called during -		 * deactivation and again on shutdown.  Remove only once. -		 * A cache is never shut down before deactivation is -		 * complete, so no need to worry about synchronization. 
-		 */ -		return; - -#ifdef CONFIG_MEMCG -	kset_unregister(s->memcg_kset); -#endif -	kobject_uevent(&s->kobj, KOBJ_REMOVE); -	kobject_del(&s->kobj); +	kobject_get(&s->kobj); +	schedule_work(&s->kobj_remove_work);  }  void sysfs_slab_release(struct kmem_cache *s) diff --git a/mm/sparse.c b/mm/sparse.c index 6903c8fc3085..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,  	}  } +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each.  But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early. + */ +int __highest_present_section_nr; +static void section_mark_present(struct mem_section *ms) +{ +	int section_nr = __section_nr(ms); + +	if (section_nr > __highest_present_section_nr) +		__highest_present_section_nr = section_nr; + +	ms->section_mem_map |= SECTION_MARKED_PRESENT; +} + +static inline int next_present_section_nr(int section_nr) +{ +	do { +		section_nr++; +		if (present_section_nr(section_nr)) +			return section_nr; +	} while ((section_nr < NR_MEM_SECTIONS) && +		 (section_nr <= __highest_present_section_nr)); + +	return -1; +} +#define for_each_present_section_nr(start, section_nr)		\ +	for (section_nr = next_present_section_nr(start-1);	\ +	     ((section_nr >= 0) &&				\ +	      (section_nr < NR_MEM_SECTIONS) &&			\ +	      (section_nr <= __highest_present_section_nr));	\ +	     section_nr = next_present_section_nr(section_nr)) +  /* Record a memory area against a node. */  void __init memory_present(int nid, unsigned long start, unsigned long end)  { @@ -183,9 +221,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)  		set_section_nid(section, nid);  		ms = __nr_to_section(section); -		if (!ms->section_mem_map) +		if (!ms->section_mem_map) {  			ms->section_mem_map = sparse_encode_early_nid(nid) | -							SECTION_MARKED_PRESENT; +							SECTION_IS_ONLINE; +			section_mark_present(ms); +		}  	}  } @@ -476,23 +516,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)  	int nodeid_begin = 0;  	unsigned long pnum_begin = 0; -	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { +	for_each_present_section_nr(0, pnum) {  		struct mem_section *ms; -		if (!present_section_nr(pnum)) -			continue;  		ms = __nr_to_section(pnum);  		nodeid_begin = sparse_early_nid(ms);  		pnum_begin = pnum;  		break;  	}  	map_count = 1; -	for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { +	for_each_present_section_nr(pnum_begin + 1, pnum) {  		struct mem_section *ms;  		int nodeid; -		if (!present_section_nr(pnum)) -			continue;  		ms = __nr_to_section(pnum);  		nodeid = sparse_early_nid(ms);  		if (nodeid == nodeid_begin) { @@ -561,10 +597,7 @@ void __init sparse_init(void)  							(void *)map_map);  #endif -	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { -		if (!present_section_nr(pnum)) -			continue; - +	for_each_present_section_nr(0, pnum) {  		usemap = usemap_map[pnum];  		if (!usemap)  			continue; @@ -590,6 +623,48 @@ void __init sparse_init(void)  }  #ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ +	unsigned long pfn; + +	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { +		unsigned long 
section_nr = pfn_to_section_nr(start_pfn); +		struct mem_section *ms; + +		/* onlining code should never touch invalid ranges */ +		if (WARN_ON(!valid_section_nr(section_nr))) +			continue; + +		ms = __nr_to_section(section_nr); +		ms->section_mem_map |= SECTION_IS_ONLINE; +	} +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* Mark all memory sections within the pfn range as online */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ +	unsigned long pfn; + +	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { +		unsigned long section_nr = pfn_to_section_nr(start_pfn); +		struct mem_section *ms; + +		/* +		 * TODO this needs some double checking. Offlining code makes +		 * sure to check pfn_valid but those checks might be just bogus +		 */ +		if (WARN_ON(!valid_section_nr(section_nr))) +			continue; + +		ms = __nr_to_section(section_nr); +		ms->section_mem_map &= ~SECTION_IS_ONLINE; +	} +} +#endif +  #ifdef CONFIG_SPARSEMEM_VMEMMAP  static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)  { @@ -686,10 +761,9 @@ static void free_map_bootmem(struct page *memmap)   * set.  If this is <=0, then that means that the passed-in   * map was not consumed and must be freed.   */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)  {  	unsigned long section_nr = pfn_to_section_nr(start_pfn); -	struct pglist_data *pgdat = zone->zone_pgdat;  	struct mem_section *ms;  	struct page *memmap;  	unsigned long *usemap; @@ -722,7 +796,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)  	memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); -	ms->section_mem_map |= SECTION_MARKED_PRESENT; +	section_mark_present(ms);  	ret = sparse_init_one_section(ms, section_nr, memmap, usemap); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..4f44dbd7f780 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,  		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);  		__count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); +		count_memcg_page_event(page, PGLAZYFREE);  		update_page_reclaim_stat(lruvec, 1, 0);  	}  } diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index ac6318a064d3..fcd2740f4ed7 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type)  		if (!page)  			goto not_enough_page;  		ctrl->map[idx] = page; + +		if (!(idx % SWAP_CLUSTER_MAX)) +			cond_resched();  	}  	return 0;  not_enough_page: @@ -58,21 +61,27 @@ not_enough_page:  	return -ENOMEM;  } +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, +						pgoff_t offset) +{ +	struct page *mappage; +	struct swap_cgroup *sc; + +	mappage = ctrl->map[offset / SC_PER_PAGE]; +	sc = page_address(mappage); +	return sc + offset % SC_PER_PAGE; +} +  static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,  					struct swap_cgroup_ctrl **ctrlp)  {  	pgoff_t offset = swp_offset(ent);  	struct swap_cgroup_ctrl *ctrl; -	struct page *mappage; -	struct swap_cgroup *sc;  	ctrl = &swap_cgroup_ctrl[swp_type(ent)];  	if (ctrlp)  		*ctrlp = ctrl; - -	mappage = ctrl->map[offset / SC_PER_PAGE]; -	sc = page_address(mappage); -	return sc + offset % SC_PER_PAGE; +	return __lookup_swap_cgroup(ctrl, offset);  }  /** @@ -105,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,  }  /** - * 
swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into   * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded   *   * Returns old value at success, 0 at failure.   * (Of course, old value can be 0.)   */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, +				  unsigned int nr_ents)  {  	struct swap_cgroup_ctrl *ctrl;  	struct swap_cgroup *sc;  	unsigned short old;  	unsigned long flags; +	pgoff_t offset = swp_offset(ent); +	pgoff_t end = offset + nr_ents;  	sc = lookup_swap_cgroup(ent, &ctrl);  	spin_lock_irqsave(&ctrl->lock, flags);  	old = sc->id; -	sc->id = id; +	for (;;) { +		VM_BUG_ON(sc->id != old); +		sc->id = id; +		offset++; +		if (offset == end) +			break; +		if (offset % SC_PER_PAGE) +			sc++; +		else +			sc = __lookup_swap_cgroup(ctrl, offset); +	}  	spin_unlock_irqrestore(&ctrl->lock, flags);  	return old; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 58f6c78f1dad..90c1032a8ac3 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)  	cache->cur = 0;  	if (swap_slot_cache_active) -		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); +		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, +					   cache->slots);  	return cache->nr;  } @@ -301,11 +302,19 @@ direct_free:  	return 0;  } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct page *page)  {  	swp_entry_t entry, *pentry;  	struct swap_slots_cache *cache; +	entry.val = 0; + +	if (PageTransHuge(page)) { +		if (IS_ENABLED(CONFIG_THP_SWAP)) +			get_swap_pages(1, true, &entry); +		return entry; +	} +  	/*  	 * Preemption is allowed here, because we may sleep  	 * in refill_swap_slots_cache().  
But it is safe, because @@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void)  	 */  	cache = raw_cpu_ptr(&swp_slots); -	entry.val = 0;  	if (check_cache_active()) {  		mutex_lock(&cache->alloc_lock);  		if (cache->slots) { @@ -337,7 +345,7 @@ repeat:  			return entry;  	} -	get_swap_pages(1, &entry); +	get_swap_pages(1, false, &entry);  	return entry;  } diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..9c71b6b2562f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@  #include <linux/migrate.h>  #include <linux/vmalloc.h>  #include <linux/swap_slots.h> +#include <linux/huge_mm.h>  #include <asm/pgtable.h> @@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES];  static unsigned int nr_swapper_spaces[MAX_SWAPFILES];  #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)  static struct {  	unsigned long add_total; @@ -90,39 +92,46 @@ void show_swap_cache_info(void)   */  int __add_to_swap_cache(struct page *page, swp_entry_t entry)  { -	int error; +	int error, i, nr = hpage_nr_pages(page);  	struct address_space *address_space; +	pgoff_t idx = swp_offset(entry);  	VM_BUG_ON_PAGE(!PageLocked(page), page);  	VM_BUG_ON_PAGE(PageSwapCache(page), page);  	VM_BUG_ON_PAGE(!PageSwapBacked(page), page); -	get_page(page); +	page_ref_add(page, nr);  	SetPageSwapCache(page); -	set_page_private(page, entry.val);  	address_space = swap_address_space(entry);  	spin_lock_irq(&address_space->tree_lock); -	error = radix_tree_insert(&address_space->page_tree, -				  swp_offset(entry), page); -	if (likely(!error)) { -		address_space->nrpages++; -		__inc_node_page_state(page, NR_FILE_PAGES); -		INC_CACHE_INFO(add_total); +	for (i = 0; i < nr; i++) { +		set_page_private(page + i, entry.val + i); +		error = radix_tree_insert(&address_space->page_tree, +					  idx + i, page + i); +		if (unlikely(error)) +			break;  	} -	spin_unlock_irq(&address_space->tree_lock); - -	if (unlikely(error)) { +	if (likely(!error)) { +		address_space->nrpages += nr; +		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); +		ADD_CACHE_INFO(add_total, nr); +	} else {  		/*  		 * Only the context which have set SWAP_HAS_CACHE flag  		 * would call add_to_swap_cache().  		 * So add_to_swap_cache() doesn't returns -EEXIST.  		 
*/  		VM_BUG_ON(error == -EEXIST); -		set_page_private(page, 0UL); +		set_page_private(page + i, 0UL); +		while (i--) { +			radix_tree_delete(&address_space->page_tree, idx + i); +			set_page_private(page + i, 0UL); +		}  		ClearPageSwapCache(page); -		put_page(page); +		page_ref_sub(page, nr);  	} +	spin_unlock_irq(&address_space->tree_lock);  	return error;  } @@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)  {  	int error; -	error = radix_tree_maybe_preload(gfp_mask); +	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));  	if (!error) {  		error = __add_to_swap_cache(page, entry);  		radix_tree_preload_end(); @@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)   */  void __delete_from_swap_cache(struct page *page)  { -	swp_entry_t entry;  	struct address_space *address_space; +	int i, nr = hpage_nr_pages(page); +	swp_entry_t entry; +	pgoff_t idx;  	VM_BUG_ON_PAGE(!PageLocked(page), page);  	VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page)  	entry.val = page_private(page);  	address_space = swap_address_space(entry); -	radix_tree_delete(&address_space->page_tree, swp_offset(entry)); -	set_page_private(page, 0); +	idx = swp_offset(entry); +	for (i = 0; i < nr; i++) { +		radix_tree_delete(&address_space->page_tree, idx + i); +		set_page_private(page + i, 0); +	}  	ClearPageSwapCache(page); -	address_space->nrpages--; -	__dec_node_page_state(page, NR_FILE_PAGES); -	INC_CACHE_INFO(del_total); +	address_space->nrpages -= nr; +	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); +	ADD_CACHE_INFO(del_total, nr);  }  /** @@ -170,7 +184,7 @@ void __delete_from_swap_cache(struct page *page)   * Allocate swap space for the page and add the page to the   * swap cache.  Caller needs to hold the page lock.    */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page)  {  	swp_entry_t entry;  	int err; @@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list)  	VM_BUG_ON_PAGE(!PageLocked(page), page);  	VM_BUG_ON_PAGE(!PageUptodate(page), page); -	entry = get_swap_page(); +	entry = get_swap_page(page);  	if (!entry.val)  		return 0; -	if (mem_cgroup_try_charge_swap(page, entry)) { -		swapcache_free(entry); -		return 0; -	} - -	if (unlikely(PageTransHuge(page))) -		if (unlikely(split_huge_page_to_list(page, list))) { -			swapcache_free(entry); -			return 0; -		} +	if (mem_cgroup_try_charge_swap(page, entry)) +		goto fail;  	/*  	 * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +212,19 @@ int add_to_swap(struct page *page, struct list_head *list)  	 */  	err = add_to_swap_cache(page, entry,  			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - -	if (!err) { -		return 1; -	} else {	/* -ENOMEM radix-tree allocation failure */ +	/* -ENOMEM radix-tree allocation failure */ +	if (err)  		/*  		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely  		 * clear SWAP_HAS_CACHE flag.  		 
*/ -		swapcache_free(entry); -		return 0; -	} +		goto fail; + +	return 1; + +fail: +	put_swap_page(page, entry); +	return 0;  }  /* @@ -237,8 +245,8 @@ void delete_from_swap_cache(struct page *page)  	__delete_from_swap_cache(page);  	spin_unlock_irq(&address_space->tree_lock); -	swapcache_free(entry); -	put_page(page); +	put_swap_page(page, entry); +	page_ref_sub(page, hpage_nr_pages(page));  }  /*  @@ -295,7 +303,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)  	page = find_get_page(swap_address_space(entry), swp_offset(entry)); -	if (page) { +	if (page && likely(!PageTransCompound(page))) {  		INC_CACHE_INFO(find_success);  		if (TestClearPageReadahead(page))  			atomic_inc(&swapin_readahead_hits); @@ -389,7 +397,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,  		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely  		 * clear SWAP_HAS_CACHE flag.  		 */ -		swapcache_free(entry); +		put_swap_page(new_page, entry);  	} while (err != -ENOMEM);  	if (new_page) @@ -506,7 +514,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,  						gfp_mask, vma, addr);  		if (!page)  			continue; -		if (offset != entry_offset) +		if (offset != entry_offset && likely(!PageTransCompound(page)))  			SetPageReadahead(page);  		put_page(page);  	} diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f6cba1b6632..811d90e1c929 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@  #include <linux/swapfile.h>  #include <linux/export.h>  #include <linux/swap_slots.h> +#include <linux/sort.h>  #include <asm/pgtable.h>  #include <asm/tlbflush.h> @@ -199,7 +200,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,  	}  } +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER	HPAGE_PMD_NR +#else  #define SWAPFILE_CLUSTER	256 +#endif  #define LATENCY_LIMIT		256  static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -374,6 +379,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,  	schedule_work(&si->discard_work);  } +static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +{ +	struct swap_cluster_info *ci = si->cluster_info; + +	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); +	cluster_list_add_tail(&si->free_clusters, ci, idx); +} +  /*   * Doing discard actually. After a cluster discard is finished, the cluster   * will be added to free cluster list. caller should hold si->lock. 
@@ -394,10 +407,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)  		spin_lock(&si->lock);  		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); -		cluster_set_flag(ci, CLUSTER_FLAG_FREE); -		unlock_cluster(ci); -		cluster_list_add_tail(&si->free_clusters, info, idx); -		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); +		__free_cluster(si, idx);  		memset(si->swap_map + idx * SWAPFILE_CLUSTER,  				0, SWAPFILE_CLUSTER);  		unlock_cluster(ci); @@ -415,6 +425,34 @@ static void swap_discard_work(struct work_struct *work)  	spin_unlock(&si->lock);  } +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +{ +	struct swap_cluster_info *ci = si->cluster_info; + +	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); +	cluster_list_del_first(&si->free_clusters, ci); +	cluster_set_count_flag(ci + idx, 0, 0); +} + +static void free_cluster(struct swap_info_struct *si, unsigned long idx) +{ +	struct swap_cluster_info *ci = si->cluster_info + idx; + +	VM_BUG_ON(cluster_count(ci) != 0); +	/* +	 * If the swap is discardable, prepare discard the cluster +	 * instead of free it immediately. The cluster will be freed +	 * after discard. +	 */ +	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == +	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) { +		swap_cluster_schedule_discard(si, idx); +		return; +	} + +	__free_cluster(si, idx); +} +  /*   * The cluster corresponding to page_nr will be used. The cluster will be   * removed from free cluster list and its usage counter will be increased. @@ -426,11 +464,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p,  	if (!cluster_info)  		return; -	if (cluster_is_free(&cluster_info[idx])) { -		VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); -		cluster_list_del_first(&p->free_clusters, cluster_info); -		cluster_set_count_flag(&cluster_info[idx], 0, 0); -	} +	if (cluster_is_free(&cluster_info[idx])) +		alloc_cluster(p, idx);  	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);  	cluster_set_count(&cluster_info[idx], @@ -454,21 +489,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p,  	cluster_set_count(&cluster_info[idx],  		cluster_count(&cluster_info[idx]) - 1); -	if (cluster_count(&cluster_info[idx]) == 0) { -		/* -		 * If the swap is discardable, prepare discard the cluster -		 * instead of free it immediately. The cluster will be freed -		 * after discard. 
-		 */ -		if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == -				 (SWP_WRITEOK | SWP_PAGE_DISCARD)) { -			swap_cluster_schedule_discard(p, idx); -			return; -		} - -		cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); -		cluster_list_add_tail(&p->free_clusters, cluster_info, idx); -	} +	if (cluster_count(&cluster_info[idx]) == 0) +		free_cluster(p, idx);  }  /* @@ -558,6 +580,60 @@ new_cluster:  	return found_free;  } +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +			     unsigned int nr_entries) +{ +	unsigned int end = offset + nr_entries - 1; + +	if (offset == si->lowest_bit) +		si->lowest_bit += nr_entries; +	if (end == si->highest_bit) +		si->highest_bit -= nr_entries; +	si->inuse_pages += nr_entries; +	if (si->inuse_pages == si->pages) { +		si->lowest_bit = si->max; +		si->highest_bit = 0; +		spin_lock(&swap_avail_lock); +		plist_del(&si->avail_list, &swap_avail_head); +		spin_unlock(&swap_avail_lock); +	} +} + +static void swap_range_free(struct swap_info_struct *si, unsigned long offset, +			    unsigned int nr_entries) +{ +	unsigned long end = offset + nr_entries - 1; +	void (*swap_slot_free_notify)(struct block_device *, unsigned long); + +	if (offset < si->lowest_bit) +		si->lowest_bit = offset; +	if (end > si->highest_bit) { +		bool was_full = !si->highest_bit; + +		si->highest_bit = end; +		if (was_full && (si->flags & SWP_WRITEOK)) { +			spin_lock(&swap_avail_lock); +			WARN_ON(!plist_node_empty(&si->avail_list)); +			if (plist_node_empty(&si->avail_list)) +				plist_add(&si->avail_list, &swap_avail_head); +			spin_unlock(&swap_avail_lock); +		} +	} +	atomic_long_add(nr_entries, &nr_swap_pages); +	si->inuse_pages -= nr_entries; +	if (si->flags & SWP_BLKDEV) +		swap_slot_free_notify = +			si->bdev->bd_disk->fops->swap_slot_free_notify; +	else +		swap_slot_free_notify = NULL; +	while (offset <= end) { +		frontswap_invalidate_page(si->type, offset); +		if (swap_slot_free_notify) +			swap_slot_free_notify(si->bdev, offset); +		offset++; +	} +} +  static int scan_swap_map_slots(struct swap_info_struct *si,  			       unsigned char usage, int nr,  			       swp_entry_t slots[]) @@ -676,18 +752,7 @@ checks:  	inc_cluster_info_page(si, si->cluster_info, offset);  	unlock_cluster(ci); -	if (offset == si->lowest_bit) -		si->lowest_bit++; -	if (offset == si->highest_bit) -		si->highest_bit--; -	si->inuse_pages++; -	if (si->inuse_pages == si->pages) { -		si->lowest_bit = si->max; -		si->highest_bit = 0; -		spin_lock(&swap_avail_lock); -		plist_del(&si->avail_list, &swap_avail_head); -		spin_unlock(&swap_avail_lock); -	} +	swap_range_alloc(si, offset, 1);  	si->cluster_next = offset + 1;  	slots[n_ret++] = swp_entry(si->type, offset); @@ -766,6 +831,52 @@ no_page:  	return n_ret;  } +#ifdef CONFIG_THP_SWAP +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ +	unsigned long idx; +	struct swap_cluster_info *ci; +	unsigned long offset, i; +	unsigned char *map; + +	if (cluster_list_empty(&si->free_clusters)) +		return 0; + +	idx = cluster_list_first(&si->free_clusters); +	offset = idx * SWAPFILE_CLUSTER; +	ci = lock_cluster(si, offset); +	alloc_cluster(si, idx); +	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + +	map = si->swap_map + offset; +	for (i = 0; i < SWAPFILE_CLUSTER; i++) +		map[i] = SWAP_HAS_CACHE; +	unlock_cluster(ci); +	swap_range_alloc(si, offset, SWAPFILE_CLUSTER); +	*slot = swp_entry(si->type, offset); + +	return 1; +} + +static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +{ 
+	unsigned long offset = idx * SWAPFILE_CLUSTER; +	struct swap_cluster_info *ci; + +	ci = lock_cluster(si, offset); +	cluster_set_count_flag(ci, 0, 0); +	free_cluster(si, idx); +	unlock_cluster(ci); +	swap_range_free(si, offset, SWAPFILE_CLUSTER); +} +#else +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ +	VM_WARN_ON_ONCE(1); +	return 0; +} +#endif /* CONFIG_THP_SWAP */ +  static unsigned long scan_swap_map(struct swap_info_struct *si,  				   unsigned char usage)  { @@ -781,13 +892,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,  } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])  { +	unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;  	struct swap_info_struct *si, *next;  	long avail_pgs;  	int n_ret = 0; -	avail_pgs = atomic_long_read(&nr_swap_pages); +	/* Only single cluster request supported */ +	WARN_ON_ONCE(n_goal > 1 && cluster); + +	avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;  	if (avail_pgs <= 0)  		goto noswap; @@ -797,7 +912,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[])  	if (n_goal > avail_pgs)  		n_goal = avail_pgs; -	atomic_long_sub(n_goal, &nr_swap_pages); +	atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);  	spin_lock(&swap_avail_lock); @@ -823,10 +938,13 @@ start_over:  			spin_unlock(&si->lock);  			goto nextsi;  		} -		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, -					    n_goal, swp_entries); +		if (cluster) +			n_ret = swap_alloc_cluster(si, swp_entries); +		else +			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, +						    n_goal, swp_entries);  		spin_unlock(&si->lock); -		if (n_ret) +		if (n_ret || cluster)  			goto check_out;  		pr_debug("scan_swap_map of si %d failed to find offset\n",  			si->type); @@ -852,7 +970,8 @@ nextsi:  check_out:  	if (n_ret < n_goal) -		atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); +		atomic_long_add((long)(n_goal - n_ret) * nr_pages, +				&nr_swap_pages);  noswap:  	return n_ret;  } @@ -1008,32 +1127,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)  	dec_cluster_info_page(p, p->cluster_info, offset);  	unlock_cluster(ci); -	mem_cgroup_uncharge_swap(entry); -	if (offset < p->lowest_bit) -		p->lowest_bit = offset; -	if (offset > p->highest_bit) { -		bool was_full = !p->highest_bit; - -		p->highest_bit = offset; -		if (was_full && (p->flags & SWP_WRITEOK)) { -			spin_lock(&swap_avail_lock); -			WARN_ON(!plist_node_empty(&p->avail_list)); -			if (plist_node_empty(&p->avail_list)) -				plist_add(&p->avail_list, -					  &swap_avail_head); -			spin_unlock(&swap_avail_lock); -		} -	} -	atomic_long_inc(&nr_swap_pages); -	p->inuse_pages--; -	frontswap_invalidate_page(p->type, offset); -	if (p->flags & SWP_BLKDEV) { -		struct gendisk *disk = p->bdev->bd_disk; - -		if (disk->fops->swap_slot_free_notify) -			disk->fops->swap_slot_free_notify(p->bdev, -							  offset); -	} +	mem_cgroup_uncharge_swap(entry, 1); +	swap_range_free(p, offset, 1);  }  /* @@ -1054,7 +1149,7 @@ void swap_free(swp_entry_t entry)  /*   * Called after dropping swapcache to decrease refcnt to swap entries.   
*/ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry)  {  	struct swap_info_struct *p; @@ -1065,6 +1160,52 @@ void swapcache_free(swp_entry_t entry)  	}  } +#ifdef CONFIG_THP_SWAP +static void swapcache_free_cluster(swp_entry_t entry) +{ +	unsigned long offset = swp_offset(entry); +	unsigned long idx = offset / SWAPFILE_CLUSTER; +	struct swap_cluster_info *ci; +	struct swap_info_struct *si; +	unsigned char *map; +	unsigned int i; + +	si = swap_info_get(entry); +	if (!si) +		return; + +	ci = lock_cluster(si, offset); +	map = si->swap_map + offset; +	for (i = 0; i < SWAPFILE_CLUSTER; i++) { +		VM_BUG_ON(map[i] != SWAP_HAS_CACHE); +		map[i] = 0; +	} +	unlock_cluster(ci); +	mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); +	swap_free_cluster(si, idx); +	spin_unlock(&si->lock); +} +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} +#endif /* CONFIG_THP_SWAP */ + +void put_swap_page(struct page *page, swp_entry_t entry) +{ +	if (!PageTransHuge(page)) +		swapcache_free(entry); +	else +		swapcache_free_cluster(entry); +} + +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ +	const swp_entry_t *e1 = ent1, *e2 = ent2; + +	return (int)swp_type(*e1) - (int)swp_type(*e2); +} +  void swapcache_free_entries(swp_entry_t *entries, int n)  {  	struct swap_info_struct *p, *prev; @@ -1075,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n)  	prev = NULL;  	p = NULL; + +	/* +	 * Sort swap entries by swap device, so each lock is only taken once. +	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is +	 * so low that it isn't necessary to optimize further. +	 */ +	if (nr_swapfiles > 1) +		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);  	for (i = 0; i < n; ++i) {  		p = swap_info_get_cont(entries[i], prev);  		if (p) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 34a1c3e46ed7..6211a807cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)  	if (p4d_none(*p4d))  		return NULL;  	pud = pud_offset(p4d, addr); -	if (pud_none(*pud)) + +	/* +	 * Don't dereference bad PUD or PMD (below) entries. This will also +	 * identify huge mappings, which we may encounter on architectures +	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be +	 * identified as vmalloc addresses by is_vmalloc_addr(), but are +	 * not [unambiguously] associated with a struct page, so there is +	 * no correct value to return for them. +	 */ +	WARN_ON_ONCE(pud_bad(*pud)); +	if (pud_none(*pud) || pud_bad(*pud))  		return NULL;  	pmd = pmd_offset(pud, addr); -	if (pmd_none(*pmd)) +	WARN_ON_ONCE(pmd_bad(*pmd)); +	if (pmd_none(*pmd) || pmd_bad(*pmd))  		return NULL;  	ptep = pte_offset_map(pmd, addr); @@ -1759,12 +1770,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,  	 */  	clear_vm_uninitialized_flag(area); -	/* -	 * A ref_count = 2 is needed because vm_struct allocated in -	 * __get_vm_area_node() contains a reference to the virtual address of -	 * the vmalloc'ed block. 
-	 */ -	kmemleak_alloc(addr, real_size, 2, gfp_mask); +	kmemleak_vmalloc(area, size, gfp_mask);  	return addr; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 6063581f705c..ce0618bfa8d0 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,  	unsigned long pressure = 0;  	/* -	 * reclaimed can be greater than scanned in cases -	 * like THP, where the scanned is 1 and reclaimed -	 * could be 512 +	 * reclaimed can be greater than scanned for things such as reclaimed +	 * slab pages. shrink_node() just adds reclaimed pages without a +	 * related increment to scanned pages.  	 */  	if (reclaimed >= scanned)  		goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ad39bbc79e6..9e95fafc026b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,  		mem_cgroup_swapout(page, swap);  		__delete_from_swap_cache(page);  		spin_unlock_irqrestore(&mapping->tree_lock, flags); -		swapcache_free(swap); +		put_swap_page(page, swap);  	} else {  		void (*freepage)(struct page *);  		void *shadow = NULL; @@ -1125,8 +1125,36 @@ static unsigned long shrink_page_list(struct list_head *page_list,  		    !PageSwapCache(page)) {  			if (!(sc->gfp_mask & __GFP_IO))  				goto keep_locked; -			if (!add_to_swap(page, page_list)) +			if (PageTransHuge(page)) { +				/* cannot split THP, skip it */ +				if (!can_split_huge_page(page, NULL)) +					goto activate_locked; +				/* +				 * Split pages without a PMD map right +				 * away. Chances are some or all of the +				 * tail pages can be freed without IO. +				 */ +				if (!compound_mapcount(page) && +				    split_huge_page_to_list(page, page_list)) +					goto activate_locked; +			} +			if (!add_to_swap(page)) { +				if (!PageTransHuge(page)) +					goto activate_locked; +				/* Split THP and swap individual base pages */ +				if (split_huge_page_to_list(page, page_list)) +					goto activate_locked; +				if (!add_to_swap(page)) +					goto activate_locked; +			} + +			/* XXX: We don't support THP writes */ +			if (PageTransHuge(page) && +				  split_huge_page_to_list(page, page_list)) { +				delete_from_swap_cache(page);  				goto activate_locked; +			} +  			may_enter_fs = 1;  			/* Adding to swap updated mapping */ @@ -1266,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,  			}  			count_vm_event(PGLAZYFREED); +			count_memcg_page_event(page, PGLAZYFREED);  		} else if (!mapping || !__remove_mapping(mapping, page, true))  			goto keep_locked;  		/* @@ -1295,6 +1324,7 @@ activate_locked:  		if (!PageMlocked(page)) {  			SetPageActive(page);  			pgactivate++; +			count_memcg_page_event(page, PGACTIVATE);  		}  keep_locked:  		unlock_page(page); @@ -1734,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,  	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);  	reclaim_stat->recent_scanned[file] += nr_taken; -	if (global_reclaim(sc)) { -		if (current_is_kswapd()) +	if (current_is_kswapd()) { +		if (global_reclaim(sc))  			__count_vm_events(PGSCAN_KSWAPD, nr_scanned); -		else +		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, +				   nr_scanned); +	} else { +		if (global_reclaim(sc))  			__count_vm_events(PGSCAN_DIRECT, nr_scanned); +		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, +				   nr_scanned);  	}  	spin_unlock_irq(&pgdat->lru_lock); @@ -1750,11 +1785,16 @@ shrink_inactive_list(unsigned long 
nr_to_scan, struct lruvec *lruvec,  	spin_lock_irq(&pgdat->lru_lock); -	if (global_reclaim(sc)) { -		if (current_is_kswapd()) +	if (current_is_kswapd()) { +		if (global_reclaim(sc))  			__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); -		else +		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, +				   nr_reclaimed); +	} else { +		if (global_reclaim(sc))  			__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); +		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, +				   nr_reclaimed);  	}  	putback_inactive_pages(lruvec, &page_list); @@ -1899,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec,  		}  	} -	if (!is_active_lru(lru)) +	if (!is_active_lru(lru)) {  		__count_vm_events(PGDEACTIVATE, nr_moved); +		count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, +				   nr_moved); +	}  	return nr_moved;  } @@ -1938,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan,  	reclaim_stat->recent_scanned[file] += nr_taken;  	__count_vm_events(PGREFILL, nr_scanned); +	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);  	spin_unlock_irq(&pgdat->lru_lock); @@ -2967,7 +3011,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,  	unsigned long nr_reclaimed;  	struct scan_control sc = {  		.nr_to_reclaim = SWAP_CLUSTER_MAX, -		.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), +		.gfp_mask = current_gfp_context(gfp_mask),  		.reclaim_idx = gfp_zone(gfp_mask),  		.order = order,  		.nodemask = nodemask, @@ -2982,12 +3026,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,  	 * 1 is returned so that the page allocator does not OOM kill at this  	 * point.  	 */ -	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) +	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))  		return 1;  	trace_mm_vmscan_direct_reclaim_begin(order,  				sc.may_writepage, -				gfp_mask, +				sc.gfp_mask,  				sc.reclaim_idx);  	nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3652,7 +3696,7 @@ int kswapd_run(int nid)  	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);  	if (IS_ERR(pgdat->kswapd)) {  		/* failure at boot is fatal */ -		BUG_ON(system_state == SYSTEM_BOOTING); +		BUG_ON(system_state < SYSTEM_RUNNING);  		pr_err("Failed to start kswapd on node %d\n", nid);  		ret = PTR_ERR(pgdat->kswapd);  		pgdat->kswapd = NULL; @@ -3774,17 +3818,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in  	const unsigned long nr_pages = 1 << order;  	struct task_struct *p = current;  	struct reclaim_state reclaim_state; -	int classzone_idx = gfp_zone(gfp_mask);  	unsigned int noreclaim_flag;  	struct scan_control sc = {  		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), -		.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), +		.gfp_mask = current_gfp_context(gfp_mask),  		.order = order,  		.priority = NODE_RECLAIM_PRIORITY,  		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),  		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),  		.may_swap = 1, -		.reclaim_idx = classzone_idx, +		.reclaim_idx = gfp_zone(gfp_mask),  	};  	cond_resched(); @@ -3795,7 +3838,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in  	 */  	noreclaim_flag = memalloc_noreclaim_save();  	p->flags |= PF_SWAPWRITE; -	lockdep_set_current_reclaim_state(gfp_mask); +	lockdep_set_current_reclaim_state(sc.gfp_mask);  	reclaim_state.reclaimed_slab = 0;  	p->reclaim_state = &reclaim_state; @@ -3831,7 +3874,7 @@ int node_reclaim(struct pglist_data 
*pgdat, gfp_t gfp_mask, unsigned int order)  	 * unmapped file backed pages.  	 */  	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && -	    sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) +	    node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)  		return NODE_RECLAIM_FULL;  	/* diff --git a/mm/vmstat.c b/mm/vmstat.c index 76f73670200a..744ceaeb42a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = {  	"nr_zone_unevictable",  	"nr_zone_write_pending",  	"nr_mlock", -	"nr_slab_reclaimable", -	"nr_slab_unreclaimable",  	"nr_page_table_pages",  	"nr_kernel_stack",  	"nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = {  	"nr_inactive_file",  	"nr_active_file",  	"nr_unevictable", +	"nr_slab_reclaimable", +	"nr_slab_unreclaimable",  	"nr_isolated_anon",  	"nr_isolated_file",  	"workingset_refault", @@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = {  	"drop_pagecache",  	"drop_slab", +	"oom_kill",  #ifdef CONFIG_NUMA_BALANCING  	"numa_pte_updates", @@ -1223,11 +1224,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {  		struct page *page; -		if (!pfn_valid(pfn)) +		page = pfn_to_online_page(pfn); +		if (!page)  			continue; -		page = pfn_to_page(pfn); -  		/* Watch for unexpected holes punched in the memmap */  		if (!memmap_valid_within(pfn, page, zone))  			continue; @@ -1322,7 +1322,7 @@ static int fragmentation_open(struct inode *inode, struct file *file)  	return seq_open(file, &fragmentation_op);  } -static const struct file_operations fragmentation_file_operations = { +static const struct file_operations buddyinfo_file_operations = {  	.open		= fragmentation_open,  	.read		= seq_read,  	.llseek		= seq_lseek, @@ -1341,7 +1341,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file)  	return seq_open(file, &pagetypeinfo_op);  } -static const struct file_operations pagetypeinfo_file_ops = { +static const struct file_operations pagetypeinfo_file_operations = {  	.open		= pagetypeinfo_open,  	.read		= seq_read,  	.llseek		= seq_lseek, @@ -1463,7 +1463,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file)  	return seq_open(file, &zoneinfo_op);  } -static const struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations zoneinfo_file_operations = {  	.open		= zoneinfo_open,  	.read		= seq_read,  	.llseek		= seq_lseek, @@ -1552,7 +1552,7 @@ static int vmstat_open(struct inode *inode, struct file *file)  	return seq_open(file, &vmstat_op);  } -static const struct file_operations proc_vmstat_file_operations = { +static const struct file_operations vmstat_file_operations = {  	.open		= vmstat_open,  	.read		= seq_read,  	.llseek		= seq_lseek, @@ -1785,10 +1785,10 @@ void __init init_mm_internals(void)  	start_shepherd_timer();  #endif  #ifdef CONFIG_PROC_FS -	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); -	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); -	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); -	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); +	proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); +	proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); +	proc_create("vmstat", 0444, NULL, &vmstat_file_operations); +	proc_create("zoneinfo", 0444, NULL, 
&zoneinfo_file_operations);  #endif  } diff --git a/mm/workingset.c b/mm/workingset.c index b8c9ab678479..7119cd745ace 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -288,12 +288,10 @@ bool workingset_refault(void *shadow)  	 */  	refault_distance = (refault - eviction) & EVICTION_MASK; -	inc_node_state(pgdat, WORKINGSET_REFAULT); -	inc_memcg_state(memcg, WORKINGSET_REFAULT); +	inc_lruvec_state(lruvec, WORKINGSET_REFAULT);  	if (refault_distance <= active_file) { -		inc_node_state(pgdat, WORKINGSET_ACTIVATE); -		inc_memcg_state(memcg, WORKINGSET_ACTIVATE); +		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);  		rcu_read_unlock();  		return true;  	} @@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,  	}  	if (WARN_ON_ONCE(node->exceptional))  		goto out_invalid; -	inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); -	inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); +	inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);  	__radix_tree_delete_node(&mapping->page_tree, node,  				 workingset_update_node, mapping); diff --git a/mm/zswap.c b/mm/zswap.c index eedc27894b10..d39581a076c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu)  	u8 *dst;  	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); -	if (!dst) { -		pr_err("can't allocate compressor buffer\n"); +	if (!dst)  		return -ENOMEM; -	} +  	per_cpu(zswap_dstmem, cpu) = dst;  	return 0;  } @@ -515,10 +514,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)  	}  	pool = kzalloc(sizeof(*pool), GFP_KERNEL); -	if (!pool) { -		pr_err("pool alloc failed\n"); +	if (!pool)  		return NULL; -	}  	/* unique name for each pool specifically required by zsmalloc */  	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); @@ -1158,7 +1155,7 @@ static void zswap_frontswap_init(unsigned type)  {  	struct zswap_tree *tree; -	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); +	tree = kzalloc(sizeof(*tree), GFP_KERNEL);  	if (!tree) {  		pr_err("alloc failed, zswap disabled for swap type %d\n", type);  		return; |
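
A closing reading aid, not part of the patch above: the accounting changes scattered through the rmap, slab, slub, vmscan and workingset hunks all follow one pattern, collapsing a node-level counter update paired with a memcg-level update into a single lruvec call. The sketch below is only a restatement of the two calls being removed in those hunks (for example in page_add_file_rmap()); the helper name is a placeholder and this is not the mainline implementation of __mod_lruvec_page_state().

/*
 * Sketch only: restates the pair of calls that the new lruvec helper
 * replaces throughout this diff.
 */
static inline void sketch_mod_lruvec_page_state(struct page *page,
						enum node_stat_item idx,
						int val)
{
	/* node-wide counter, formerly an explicit __mod_node_page_state() */
	__mod_node_page_state(page_pgdat(page), idx, val);
	/* per-memcg counter, formerly an explicit mod_memcg_page_state() */
	mod_memcg_page_state(page, idx, val);
}

The double-underscore (irq-unsafe) convention carries over unchanged, which is why the comment in page_remove_file_rmap() only needed its function name updated rather than any change to its locking argument.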