Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--  mm/vmalloc.c  652
1 file changed, 500 insertions, 152 deletions
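For orientation before reading the diff: the patch replaces the size-based map_kernel_range()/unmap_kernel_range() helpers with range-based vmap_pages_range()/vunmap_range(), and lets vmalloc back allocations with PMD-sized mappings on architectures that support them. A minimal sketch of the new mapping path follows, mirroring what the updated vmap() does in the diff; the wrapper function itself is hypothetical, and vmap_pages_range() stays internal to mm/vmalloc.c.

```c
/*
 * Sketch only: mirrors the updated vmap() in the diff below.
 * vmap_pages_range()/vunmap_range() are mm-internal; outside callers
 * keep using vmap()/vunmap() and vmalloc()/vfree().
 */
static void *example_vmap(struct page **pages, unsigned int count, pgprot_t prot)
{
	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	struct vm_struct *area;
	unsigned long addr;

	area = get_vm_area(size, VM_MAP);
	if (!area)
		return NULL;

	addr = (unsigned long)area->addr;
	/*
	 * PAGE_SHIFT maps with small pages; a larger page_shift allows the
	 * mapping code to install huge leaves when the pages are physically
	 * contiguous and both addresses are suitably aligned. pgprot_nx()
	 * strips execute permission, as vmap() itself does.
	 */
	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
			     pages, PAGE_SHIFT) < 0) {
		vunmap(area->addr);
		return NULL;
	}

	return area->addr;
}
```

The unmap side is symmetric: vunmap_range(addr, addr + size) flushes caches, clears the page tables and flushes the TLB, while the _noflush variants leave flushing to the caller.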
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4f5f8c907897..d33894d7b27a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,7 +34,7 @@  #include <linux/bitops.h>  #include <linux/rbtree_augmented.h>  #include <linux/overflow.h> - +#include <linux/pgtable.h>  #include <linux/uaccess.h>  #include <asm/tlbflush.h>  #include <asm/shmparam.h> @@ -42,6 +42,19 @@  #include "internal.h"  #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ +	vmap_allow_huge = false; +	return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +  bool is_vmalloc_addr(const void *x)  {  	unsigned long addr = (unsigned long)x; @@ -68,6 +81,218 @@ static void free_work(struct work_struct *w)  }  /*** Page table manipulation functions ***/ +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			pgtbl_mod_mask *mask) +{ +	pte_t *pte; +	u64 pfn; + +	pfn = phys_addr >> PAGE_SHIFT; +	pte = pte_alloc_kernel_track(pmd, addr, mask); +	if (!pte) +		return -ENOMEM; +	do { +		BUG_ON(!pte_none(*pte)); +		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); +		pfn++; +	} while (pte++, addr += PAGE_SIZE, addr != end); +	*mask |= PGTBL_PTE_MODIFIED; +	return 0; +} + +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift) +{ +	if (max_page_shift < PMD_SHIFT) +		return 0; + +	if (!arch_vmap_pmd_supported(prot)) +		return 0; + +	if ((end - addr) != PMD_SIZE) +		return 0; + +	if (!IS_ALIGNED(addr, PMD_SIZE)) +		return 0; + +	if (!IS_ALIGNED(phys_addr, PMD_SIZE)) +		return 0; + +	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) +		return 0; + +	return pmd_set_huge(pmd, phys_addr, prot); +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ +	pmd_t *pmd; +	unsigned long next; + +	pmd = pmd_alloc_track(&init_mm, pud, addr, mask); +	if (!pmd) +		return -ENOMEM; +	do { +		next = pmd_addr_end(addr, end); + +		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, +					max_page_shift)) { +			*mask |= PGTBL_PMD_MODIFIED; +			continue; +		} + +		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) +			return -ENOMEM; +	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end); +	return 0; +} + +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift) +{ +	if (max_page_shift < PUD_SHIFT) +		return 0; + +	if (!arch_vmap_pud_supported(prot)) +		return 0; + +	if ((end - addr) != PUD_SIZE) +		return 0; + +	if (!IS_ALIGNED(addr, PUD_SIZE)) +		return 0; + +	if (!IS_ALIGNED(phys_addr, PUD_SIZE)) +		return 0; + +	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) +		return 0; + +	return pud_set_huge(pud, phys_addr, prot); +} + +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ +	pud_t *pud; +	unsigned long next; + +	pud = pud_alloc_track(&init_mm, p4d, addr, mask); +	if (!pud) +		return -ENOMEM; +	do { +		next = pud_addr_end(addr, end); + +		if 
(vmap_try_huge_pud(pud, addr, next, phys_addr, prot, +					max_page_shift)) { +			*mask |= PGTBL_PUD_MODIFIED; +			continue; +		} + +		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, +					max_page_shift, mask)) +			return -ENOMEM; +	} while (pud++, phys_addr += (next - addr), addr = next, addr != end); +	return 0; +} + +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift) +{ +	if (max_page_shift < P4D_SHIFT) +		return 0; + +	if (!arch_vmap_p4d_supported(prot)) +		return 0; + +	if ((end - addr) != P4D_SIZE) +		return 0; + +	if (!IS_ALIGNED(addr, P4D_SIZE)) +		return 0; + +	if (!IS_ALIGNED(phys_addr, P4D_SIZE)) +		return 0; + +	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) +		return 0; + +	return p4d_set_huge(p4d, phys_addr, prot); +} + +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ +	p4d_t *p4d; +	unsigned long next; + +	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); +	if (!p4d) +		return -ENOMEM; +	do { +		next = p4d_addr_end(addr, end); + +		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, +					max_page_shift)) { +			*mask |= PGTBL_P4D_MODIFIED; +			continue; +		} + +		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, +					max_page_shift, mask)) +			return -ENOMEM; +	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end); +	return 0; +} + +static int vmap_range_noflush(unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift) +{ +	pgd_t *pgd; +	unsigned long start; +	unsigned long next; +	int err; +	pgtbl_mod_mask mask = 0; + +	might_sleep(); +	BUG_ON(addr >= end); + +	start = addr; +	pgd = pgd_offset_k(addr); +	do { +		next = pgd_addr_end(addr, end); +		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, +					max_page_shift, &mask); +		if (err) +			break; +	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + +	if (mask & ARCH_PAGE_TABLE_SYNC_MASK) +		arch_sync_kernel_mappings(start, end); + +	return err; +} + +int vmap_range(unsigned long addr, unsigned long end, +			phys_addr_t phys_addr, pgprot_t prot, +			unsigned int max_page_shift) +{ +	int err; + +	err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); +	flush_cache_vmap(addr, end); + +	return err; +}  static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  			     pgtbl_mod_mask *mask) @@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,  	} while (p4d++, addr = next, addr != end);  } -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @start: start of the VM area to unmap - * @size: size of the VM area to unmap +/* + * vunmap_range_noflush is similar to vunmap_range, but does not + * flush caches or TLBs.   * - * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify - * should have been allocated using get_vm_area() and its friends. + * The caller is responsible for calling flush_cache_vmap() before calling + * this function, and flush_tlb_kernel_range after it has returned + * successfully (and before the addresses are expected to cause a page fault + * or be re-mapped for something else, if TLB flushes are being delayed or + * coalesced).   * - * NOTE: - * This function does NOT do any cache flushing.  
The caller is responsible - * for calling flush_cache_vunmap() on to-be-mapped areas before calling this - * function and flush_tlb_kernel_range() after. + * This is an internal function only. Do not use outside mm/.   */ -void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +void vunmap_range_noflush(unsigned long start, unsigned long end)  { -	unsigned long end = start + size;  	unsigned long next;  	pgd_t *pgd;  	unsigned long addr = start; @@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)  		arch_sync_kernel_mappings(start, end);  } -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, +/** + * vunmap_range - unmap kernel virtual addresses + * @addr: start of the VM area to unmap + * @end: end of the VM area to unmap (non-inclusive) + * + * Clears any present PTEs in the virtual address range, flushes TLBs and + * caches. Any subsequent access to the address before it has been re-mapped + * is a kernel bug. + */ +void vunmap_range(unsigned long addr, unsigned long end) +{ +	flush_cache_vunmap(addr, end); +	vunmap_range_noflush(addr, end); +	flush_tlb_kernel_range(addr, end); +} + +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,  		pgtbl_mod_mask *mask)  { @@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,  	return 0;  } -static int vmap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,  		pgtbl_mod_mask *mask)  { @@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,  		return -ENOMEM;  	do {  		next = pmd_addr_end(addr, end); -		if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) +		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))  			return -ENOMEM;  	} while (pmd++, addr = next, addr != end);  	return 0;  } -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,  		pgtbl_mod_mask *mask)  { @@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,  		return -ENOMEM;  	do {  		next = pud_addr_end(addr, end); -		if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) +		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))  			return -ENOMEM;  	} while (pud++, addr = next, addr != end);  	return 0;  } -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,  		unsigned long end, pgprot_t prot, struct page **pages, int *nr,  		pgtbl_mod_mask *mask)  { @@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,  		return -ENOMEM;  	do {  		next = p4d_addr_end(addr, end); -		if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) +		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))  			return -ENOMEM;  	} while (p4d++, addr = next, addr != end);  	return 0;  } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. 
- * - * NOTE: - * This function does NOT do any cache flushing.  The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, -			     pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, +		pgprot_t prot, struct page **pages)  {  	unsigned long start = addr; -	unsigned long end = addr + size; -	unsigned long next;  	pgd_t *pgd; +	unsigned long next;  	int err = 0;  	int nr = 0;  	pgtbl_mod_mask mask = 0; @@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,  		next = pgd_addr_end(addr, end);  		if (pgd_bad(*pgd))  			mask |= PGTBL_PGD_MODIFIED; -		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); +		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);  		if (err)  			return err;  	} while (pgd++, addr = next, addr != end); @@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,  	return 0;  } -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, -		struct page **pages) +/* + * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. + */ +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +		pgprot_t prot, struct page **pages, unsigned int page_shift)  { -	int ret; +	unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + +	WARN_ON(page_shift < PAGE_SHIFT); + +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || +			page_shift == PAGE_SHIFT) +		return vmap_small_pages_range_noflush(addr, end, prot, pages); + +	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { +		int err; + +		err = vmap_range_noflush(addr, addr + (1UL << page_shift), +					__pa(page_address(pages[i])), prot, +					page_shift); +		if (err) +			return err; + +		addr += 1UL << page_shift; +	} + +	return 0; +} + +/** + * vmap_pages_range - map pages to a kernel virtual address + * @addr: start of the VM area to map + * @end: end of the VM area to map (non-inclusive) + * @prot: page protection flags to use + * @pages: pages to map (always PAGE_SIZE pages) + * @page_shift: maximum shift that the pages may be mapped with, @pages must + * be aligned and contiguous up to at least this shift. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int vmap_pages_range(unsigned long addr, unsigned long end, +		pgprot_t prot, struct page **pages, unsigned int page_shift) +{ +	int err; -	ret = map_kernel_range_noflush(start, size, prot, pages); -	flush_cache_vmap(start, start + size); -	return ret; +	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +	flush_cache_vmap(addr, end); +	return err;  }  int is_vmalloc_or_module_addr(const void *x) @@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x)  }  /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings.   
*/  struct page *vmalloc_to_page(const void *vmalloc_addr)  { @@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)  	if (pgd_none(*pgd))  		return NULL; +	if (WARN_ON_ONCE(pgd_leaf(*pgd))) +		return NULL; /* XXX: no allowance for huge pgd */ +	if (WARN_ON_ONCE(pgd_bad(*pgd))) +		return NULL; +  	p4d = p4d_offset(pgd, addr);  	if (p4d_none(*p4d))  		return NULL; -	pud = pud_offset(p4d, addr); +	if (p4d_leaf(*p4d)) +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); +	if (WARN_ON_ONCE(p4d_bad(*p4d))) +		return NULL; -	/* -	 * Don't dereference bad PUD or PMD (below) entries. This will also -	 * identify huge mappings, which we may encounter on architectures -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are -	 * not [unambiguously] associated with a struct page, so there is -	 * no correct value to return for them. -	 */ -	WARN_ON_ONCE(pud_bad(*pud)); -	if (pud_none(*pud) || pud_bad(*pud)) +	pud = pud_offset(p4d, addr); +	if (pud_none(*pud)) +		return NULL; +	if (pud_leaf(*pud)) +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); +	if (WARN_ON_ONCE(pud_bad(*pud)))  		return NULL; +  	pmd = pmd_offset(pud, addr); -	WARN_ON_ONCE(pmd_bad(*pmd)); -	if (pmd_none(*pmd) || pmd_bad(*pmd)) +	if (pmd_none(*pmd)) +		return NULL; +	if (pmd_leaf(*pmd)) +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); +	if (WARN_ON_ONCE(pmd_bad(*pmd)))  		return NULL;  	ptep = pte_offset_map(pmd, addr); @@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)  	if (pte_present(pte))  		page = pte_page(pte);  	pte_unmap(ptep); +  	return page;  }  EXPORT_SYMBOL(vmalloc_to_page); @@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va)  	spin_unlock(&free_vmap_area_lock);  } +static inline void +preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) +{ +	struct vmap_area *va = NULL; + +	/* +	 * Preload this CPU with one extra vmap_area object. It is used +	 * when fit type of free area is NE_FIT_TYPE. It guarantees that +	 * a CPU that does an allocation is preloaded. +	 * +	 * We do it in non-atomic context, thus it allows us to use more +	 * permissive allocation masks to be more stable under low memory +	 * condition and high memory pressure. +	 */ +	if (!this_cpu_read(ne_fit_preload_node)) +		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + +	spin_lock(lock); + +	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) +		kmem_cache_free(vmap_area_cachep, va); +} +  /*   * Allocate a region of KVA of the specified size and alignment, within the   * vstart and vend. @@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,  				unsigned long vstart, unsigned long vend,  				int node, gfp_t gfp_mask)  { -	struct vmap_area *va, *pva; +	struct vmap_area *va;  	unsigned long addr;  	int purged = 0;  	int ret; @@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,  	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);  retry: -	/* -	 * Preload this CPU with one extra vmap_area object. It is used -	 * when fit type of free area is NE_FIT_TYPE. Please note, it -	 * does not guarantee that an allocation occurs on a CPU that -	 * is preloaded, instead we minimize the case when it is not. -	 * It can happen because of cpu migration, because there is a -	 * race until the below spinlock is taken. 
-	 * -	 * The preload is done in non-atomic context, thus it allows us -	 * to use more permissive allocation masks to be more stable under -	 * low memory condition and high memory pressure. In rare case, -	 * if not preloaded, GFP_NOWAIT is used. -	 * -	 * Set "pva" to NULL here, because of "retry" path. -	 */ -	pva = NULL; - -	if (!this_cpu_read(ne_fit_preload_node)) -		/* -		 * Even if it fails we do not really care about that. -		 * Just proceed as it is. If needed "overflow" path -		 * will refill the cache we allocate from. -		 */ -		pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - -	spin_lock(&free_vmap_area_lock); - -	if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) -		kmem_cache_free(vmap_area_cachep, pva); +	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); +	addr = __alloc_vmap_area(size, align, vstart, vend); +	spin_unlock(&free_vmap_area_lock);  	/*  	 * If an allocation fails, the "vend" address is  	 * returned. Therefore trigger the overflow path.  	 */ -	addr = __alloc_vmap_area(size, align, vstart, vend); -	spin_unlock(&free_vmap_area_lock); -  	if (unlikely(addr == vend))  		goto overflow; @@ -1231,7 +1503,6 @@ retry:  	va->va_end = addr + size;  	va->vm = NULL; -  	spin_lock(&vmap_area_lock);  	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);  	spin_unlock(&vmap_area_lock); @@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)  static void free_unmap_vmap_area(struct vmap_area *va)  {  	flush_cache_vunmap(va->va_start, va->va_end); -	unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); +	vunmap_range_noflush(va->va_start, va->va_end);  	if (debug_pagealloc_enabled_static())  		flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size)  	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;  	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); -	unmap_kernel_range_noflush(addr, size); +	vunmap_range_noflush(addr, addr + size);  	if (debug_pagealloc_enabled_static())  		flush_tlb_kernel_range(addr, addr + size); @@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)  		rcu_read_lock();  		list_for_each_entry_rcu(vb, &vbq->free, free_list) {  			spin_lock(&vb->lock); -			if (vb->dirty) { +			if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {  				unsigned long va_start = vb->va->va_start;  				unsigned long s, e; @@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)  	kasan_unpoison_vmalloc(mem, size); -	if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { +	if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, +				pages, PAGE_SHIFT) < 0) {  		vm_unmap_ram(mem, count);  		return NULL;  	} +  	return mem;  }  EXPORT_SYMBOL(vm_map_ram);  static struct vm_struct *vmlist __initdata; +static inline unsigned int vm_area_page_order(struct vm_struct *vm) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +	return vm->page_order; +#else +	return 0; +#endif +} + +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +	vm->page_order = order; +#else +	BUG_ON(order != 0); +#endif +} +  /**   * vm_area_add_early - add vmap area early during boot   * @vm: vm_struct to add @@ -2023,23 +2314,6 @@ void __init vmalloc_init(void)  	vmap_initialized = true;  } -/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: 
size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ -void unmap_kernel_range(unsigned long addr, unsigned long size) -{ -	unsigned long end = addr + size; - -	flush_cache_vunmap(addr, end); -	unmap_kernel_range_noflush(addr, size); -	flush_tlb_kernel_range(addr, end); -} -  static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,  	struct vmap_area *va, unsigned long flags, const void *caller)  { @@ -2199,6 +2473,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,  {  	int i; +	/* HUGE_VMALLOC passes small pages to set_direct_map */  	for (i = 0; i < area->nr_pages; i++)  		if (page_address(area->pages[i]))  			set_direct_map(area->pages[i]); @@ -2208,6 +2483,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,  static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)  {  	unsigned long start = ULONG_MAX, end = 0; +	unsigned int page_order = vm_area_page_order(area);  	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;  	int flush_dmap = 0;  	int i; @@ -2232,11 +2508,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)  	 * map. Find the start and end range of the direct mappings to make sure  	 * the vm_unmap_aliases() flush includes the direct map.  	 */ -	for (i = 0; i < area->nr_pages; i++) { +	for (i = 0; i < area->nr_pages; i += 1U << page_order) {  		unsigned long addr = (unsigned long)page_address(area->pages[i]);  		if (addr) { +			unsigned long page_size; + +			page_size = PAGE_SIZE << page_order;  			start = min(addr, start); -			end = max(addr + PAGE_SIZE, end); +			end = max(addr + page_size, end);  			flush_dmap = 1;  		}  	} @@ -2277,13 +2556,14 @@ static void __vunmap(const void *addr, int deallocate_pages)  	vm_remove_mappings(area, deallocate_pages);  	if (deallocate_pages) { +		unsigned int page_order = vm_area_page_order(area);  		int i; -		for (i = 0; i < area->nr_pages; i++) { +		for (i = 0; i < area->nr_pages; i += 1U << page_order) {  			struct page *page = area->pages[i];  			BUG_ON(!page); -			__free_pages(page, 0); +			__free_pages(page, page_order);  		}  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2402,6 +2682,7 @@ void *vmap(struct page **pages, unsigned int count,  	   unsigned long flags, pgprot_t prot)  {  	struct vm_struct *area; +	unsigned long addr;  	unsigned long size;		/* In bytes */  	might_sleep(); @@ -2414,8 +2695,9 @@ void *vmap(struct page **pages, unsigned int count,  	if (!area)  		return NULL; -	if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), -			pages) < 0) { +	addr = (unsigned long)area->addr; +	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), +				pages, PAGE_SHIFT) < 0) {  		vunmap(area->addr);  		return NULL;  	} @@ -2474,15 +2756,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);  #endif /* CONFIG_VMAP_PFN */  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, -				 pgprot_t prot, int node) +				 pgprot_t prot, unsigned int page_shift, +				 int node)  {  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; -	unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; +	unsigned long addr = (unsigned long)area->addr; +	unsigned long size = get_vm_area_size(area);  	unsigned long array_size; -	unsigned int i; +	unsigned int nr_small_pages = size >> PAGE_SHIFT; +	unsigned int page_order;  	struct page **pages; +	unsigned int i; -	array_size = (unsigned long)nr_pages * sizeof(struct 
page *); +	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);  	gfp_mask |= __GFP_NOWARN;  	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))  		gfp_mask |= __GFP_HIGHMEM; @@ -2497,42 +2783,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,  	if (!pages) {  		free_vm_area(area); +		warn_alloc(gfp_mask, NULL, +			   "vmalloc size %lu allocation failure: " +			   "page array size %lu allocation failed", +			   nr_small_pages * PAGE_SIZE, array_size);  		return NULL;  	}  	area->pages = pages; -	area->nr_pages = nr_pages; +	area->nr_pages = nr_small_pages; +	set_vm_area_page_order(area, page_shift - PAGE_SHIFT); -	for (i = 0; i < area->nr_pages; i++) { -		struct page *page; +	page_order = vm_area_page_order(area); -		if (node == NUMA_NO_NODE) -			page = alloc_page(gfp_mask); -		else -			page = alloc_pages_node(node, gfp_mask, 0); +	/* +	 * Careful, we allocate and map page_order pages, but tracking is done +	 * per PAGE_SIZE page so as to keep the vm_struct APIs independent of +	 * the physical/mapped size. +	 */ +	for (i = 0; i < area->nr_pages; i += 1U << page_order) { +		struct page *page; +		int p; +		/* Compound pages required for remap_vmalloc_page */ +		page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);  		if (unlikely(!page)) {  			/* Successfully allocated i pages, free them in __vfree() */  			area->nr_pages = i;  			atomic_long_add(area->nr_pages, &nr_vmalloc_pages); +			warn_alloc(gfp_mask, NULL, +				   "vmalloc size %lu allocation failure: " +				   "page order %u allocation failed", +				   area->nr_pages * PAGE_SIZE, page_order);  			goto fail;  		} -		area->pages[i] = page; + +		for (p = 0; p < (1U << page_order); p++) +			area->pages[i + p] = page + p; +  		if (gfpflags_allow_blocking(gfp_mask))  			cond_resched();  	}  	atomic_long_add(area->nr_pages, &nr_vmalloc_pages); -	if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), -			prot, pages) < 0) +	if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { +		warn_alloc(gfp_mask, NULL, +			   "vmalloc size %lu allocation failure: " +			   "failed to map pages", +			   area->nr_pages * PAGE_SIZE);  		goto fail; +	}  	return area->addr;  fail: -	warn_alloc(gfp_mask, NULL, -			  "vmalloc: allocation failure, allocated %ld of %ld bytes", -			  (area->nr_pages*PAGE_SIZE), area->size);  	__vfree(area->addr);  	return NULL;  } @@ -2563,19 +2867,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,  	struct vm_struct *area;  	void *addr;  	unsigned long real_size = size; +	unsigned long real_align = align; +	unsigned int shift = PAGE_SHIFT; -	size = PAGE_ALIGN(size); -	if (!size || (size >> PAGE_SHIFT) > totalram_pages()) -		goto fail; +	if (WARN_ON_ONCE(!size)) +		return NULL; + +	if ((size >> PAGE_SHIFT) > totalram_pages()) { +		warn_alloc(gfp_mask, NULL, +			   "vmalloc size %lu allocation failure: " +			   "exceeds total pages", real_size); +		return NULL; +	} + +	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && +			arch_vmap_pmd_supported(prot)) { +		unsigned long size_per_node; -	area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | +		/* +		 * Try huge pages. Only try for PAGE_KERNEL allocations, +		 * others like modules don't yet expect huge pages in +		 * their allocations due to apply_to_page_range not +		 * supporting them. 
+		 */ + +		size_per_node = size; +		if (node == NUMA_NO_NODE) +			size_per_node /= num_online_nodes(); +		if (size_per_node >= PMD_SIZE) { +			shift = PMD_SHIFT; +			align = max(real_align, 1UL << shift); +			size = ALIGN(real_size, 1UL << shift); +		} +	} + +again: +	size = PAGE_ALIGN(size); +	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |  				vm_flags, start, end, node, gfp_mask, caller); -	if (!area) +	if (!area) { +		warn_alloc(gfp_mask, NULL, +			   "vmalloc size %lu allocation failure: " +			   "vm_struct allocation failed", real_size);  		goto fail; +	} -	addr = __vmalloc_area_node(area, gfp_mask, prot, node); +	addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);  	if (!addr) -		return NULL; +		goto fail;  	/*  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2589,8 +2928,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,  	return addr;  fail: -	warn_alloc(gfp_mask, NULL, -			  "vmalloc: allocation failure: %lu bytes", real_size); +	if (shift > PAGE_SHIFT) { +		shift = PAGE_SHIFT; +		align = real_align; +		size = real_size; +		goto again; +	} +  	return NULL;  } @@ -2894,7 +3238,10 @@ long vread(char *buf, char *addr, unsigned long count)  		count = -(unsigned long) addr;  	spin_lock(&vmap_area_lock); -	list_for_each_entry(va, &vmap_area_list, list) { +	va = __find_vmap_area((unsigned long)addr); +	if (!va) +		goto finished; +	list_for_each_entry_from(va, &vmap_area_list, list) {  		if (!count)  			break; @@ -3072,7 +3419,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,  	return 0;  } -EXPORT_SYMBOL(remap_vmalloc_range_partial);  /**   * remap_vmalloc_range - map vmalloc pages to userspace @@ -3450,6 +3796,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)  }  #endif	/* CONFIG_SMP */ +#ifdef CONFIG_PRINTK  bool vmalloc_dump_obj(void *object)  {  	struct vm_struct *vm; @@ -3462,6 +3809,7 @@ bool vmalloc_dump_obj(void *object)  		vm->nr_pages, (unsigned long)vm->addr, vm->caller);  	return true;  } +#endif  #ifdef CONFIG_PROC_FS  static void *s_start(struct seq_file *m, loff_t *pos)  |
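A closing note on the allocation-side policy added near the end of the diff: __vmalloc_node_range() now opportunistically selects a PMD-sized page shift and falls back to small pages if anything in the huge path fails. Below is a condensed sketch of that decision; the helper name is invented for illustration (in the patch this logic is inline), but the individual checks are taken from the diff.

```c
/*
 * Condensed sketch of the page-shift selection in __vmalloc_node_range().
 * The helper name is made up; the patch performs these checks inline.
 */
static unsigned int pick_page_shift(unsigned long size, int node,
				    unsigned long vm_flags, pgprot_t prot)
{
	unsigned long size_per_node = size;

	/*
	 * Huge mappings are opt-out: per boot ("nohugevmalloc"), per caller
	 * (VM_NO_HUGE_VMAP) and per architecture/protection combination.
	 */
	if (!vmap_allow_huge || (vm_flags & VM_NO_HUGE_VMAP) ||
	    !arch_vmap_pmd_supported(prot))
		return PAGE_SHIFT;

	/*
	 * With no node specified, be conservative: only use huge pages if
	 * the size divided across online nodes is still at least PMD_SIZE.
	 */
	if (node == NUMA_NO_NODE)
		size_per_node /= num_online_nodes();

	return size_per_node >= PMD_SIZE ? PMD_SHIFT : PAGE_SHIFT;
}
```

If the subsequent allocation or mapping fails for any reason while shift is PMD_SHIFT, the real function resets shift, align and size to their small-page values and retries via the "again:" label, so huge vmalloc mappings remain strictly best-effort. They can also be disabled at boot with the new "nohugevmalloc" parameter on architectures that select CONFIG_HAVE_ARCH_HUGE_VMALLOC.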