Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 132
1 file changed, 83 insertions, 49 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5c390f5a5207..97b1e0290c66 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -15,7 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
 #include <linux/mmdebug.h>
@@ -25,6 +25,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/jhash.h>
+#include <linux/numa.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
-	int node = -1;
+	int node = NUMA_NO_NODE;
 
 	zonelist = node_zonelist(nid, gfp_mask);
 
@@ -919,7 +920,7 @@ retry_cpuset:
 /* Movability of hugepages depends on migration support. */
 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 {
-	if (hugepage_migration_supported(h))
+	if (hugepage_movable_supported(h))
 		return GFP_HIGHUSER_MOVABLE;
 	else
 		return GFP_HIGHUSER;
@@ -1248,10 +1249,11 @@ void free_huge_page(struct page *page)
 		(struct hugepage_subpool *)page_private(page);
 	bool restore_reserve;
 
-	set_page_private(page, 0);
-	page->mapping = NULL;
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+	set_page_private(page, 0);
+	page->mapping = NULL;
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);
 
@@ -1585,8 +1587,8 @@ out_unlock:
 	return page;
 }
 
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
-		int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+		int nid, nodemask_t *nmask)
 {
 	struct page *page;
 
@@ -2100,9 +2102,9 @@ int __alloc_bootmem_huge_page(struct hstate *h)
 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
 		void *addr;
 
-		addr = memblock_virt_alloc_try_nid_raw(
+		addr = memblock_alloc_try_nid_raw(
 				huge_page_size(h), huge_page_size(h),
-				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
+				0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -3233,22 +3235,22 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
 {
-	pte_t *src_pte, *dst_pte, entry;
+	pte_t *src_pte, *dst_pte, entry, dst_entry;
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 	int ret = 0;
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
-	mmun_start = vma->vm_start;
-	mmun_end = vma->vm_end;
-	if (cow)
-		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+	if (cow) {
+		mmu_notifier_range_init(&range, src, vma->vm_start,
+					vma->vm_end);
+		mmu_notifier_invalidate_range_start(&range);
+	}
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
@@ -3261,15 +3263,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			break;
 		}
 
-		/* If the pagetables are shared don't copy or take references */
-		if (dst_pte == src_pte)
+		/*
+		 * If the pagetables are shared don't copy or take references.
+		 * dst_pte == src_pte is the common case of src/dest sharing.
+		 *
+		 * However, src could have 'unshared' and dst shares with
+		 * another vma.  If dst_pte !none, this implies sharing.
+		 * Check here before taking page table lock, and once again
+		 * after taking the lock below.
+		 */
+		dst_entry = huge_ptep_get(dst_pte);
+		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
 			continue;
 
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
 		src_ptl = huge_pte_lockptr(h, src, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
-		if (huge_pte_none(entry)) { /* skip none entry */
+		dst_entry = huge_ptep_get(dst_pte);
+		if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+			/*
+			 * Skip if src entry none.  Also, skip in the
+			 * unlikely case dst entry !none as this implies
+			 * sharing with another vma.
+			 */
 			;
 		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
 				    is_hugetlb_entry_hwpoisoned(entry))) {
@@ -3309,7 +3326,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	}
 
 	if (cow)
-		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+		mmu_notifier_invalidate_range_end(&range);
 
 	return ret;
 }
@@ -3326,8 +3343,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
-	unsigned long mmun_start = start;	/* For mmu_notifiers */
-	unsigned long mmun_end   = end;		/* For mmu_notifiers */
+	struct mmu_notifier_range range;
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
@@ -3343,8 +3359,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	/*
 	 * If sharing possible, alert mmu notifiers of worst case.
 	 */
-	adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, start, end);
+	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+	mmu_notifier_invalidate_range_start(&range);
 	address = start;
 	for (; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address, sz);
@@ -3412,7 +3429,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (ref_page)
 			break;
 	}
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 	tlb_end_vma(tlb, vma);
 }
 
@@ -3530,9 +3547,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *old_page, *new_page;
 	int outside_reserve = 0;
 	vm_fault_t ret = 0;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
 	unsigned long haddr = address & huge_page_mask(h);
+	struct mmu_notifier_range range;
 
 	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
@@ -3609,11 +3625,9 @@ retry_avoidcopy:
 	copy_user_huge_page(new_page, old_page, address, vma,
 			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
-	set_page_huge_active(new_page);
 
-	mmun_start = haddr;
-	mmun_end = mmun_start + huge_page_size(h);
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+	mmu_notifier_invalidate_range_start(&range);
 
 	/*
 	 * Retake the page table lock to check for racing updates
@@ -3626,16 +3640,17 @@ retry_avoidcopy:
 
 		/* Break COW */
 		huge_ptep_clear_flush(vma, haddr, ptep);
-		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+		mmu_notifier_invalidate_range(mm, range.start, range.end);
 		set_huge_pte_at(mm, haddr, ptep,
 				make_huge_pte(vma, new_page, 1));
 		page_remove_rmap(old_page, true);
 		hugepage_add_new_anon_rmap(new_page, vma, haddr);
+		set_page_huge_active(new_page);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
 	spin_unlock(ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(&range);
 out_release_all:
 	restore_reserve_on_error(h, vma, haddr, new_page);
 	put_page(new_page);
@@ -3690,6 +3705,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 		return err;
 	ClearPagePrivate(page);
 
+	/*
+	 * set page dirty so that it will not be removed from cache/file
+	 * by non-hugetlbfs specific code paths.
+	 */
+	set_page_dirty(page);
+
 	spin_lock(&inode->i_lock);
 	inode->i_blocks += blocks_per_huge_page(h);
 	spin_unlock(&inode->i_lock);
@@ -3709,6 +3730,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	pte_t new_pte;
 	spinlock_t *ptl;
 	unsigned long haddr = address & huge_page_mask(h);
+	bool new_page = false;
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -3770,7 +3792,7 @@ retry:
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
-		set_page_huge_active(page);
+		new_page = true;
 
 		if (vma->vm_flags & VM_MAYSHARE) {
 			int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3841,6 +3863,15 @@ retry:
 	}
 
 	spin_unlock(ptl);
+
+	/*
+	 * Only make newly allocated pages active.  Existing pages found
+	 * in the pagecache could be !page_huge_active() if they have been
+	 * isolated for migration.
+	 */
+	if (new_page)
+		set_page_huge_active(page);
+
 	unlock_page(page);
out:
 	return ret;
@@ -4059,7 +4090,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 		/* fallback to copy_from_user outside mmap_sem */
 		if (unlikely(ret)) {
-			ret = -EFAULT;
+			ret = -ENOENT;
 			*pagep = page;
 			/* don't free the page */
 			goto out;
@@ -4075,7 +4106,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	 * the set_pte_at() write.
 	 */
 	__SetPageUptodate(page);
-	set_page_huge_active(page);
 
 	mapping = dst_vma->vm_file->f_mapping;
 	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4143,6 +4173,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
+	set_page_huge_active(page);
 	if (vm_shared)
 		unlock_page(page);
 	ret = 0;
@@ -4178,7 +4209,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * If we have a pending SIGKILL, don't keep faulting pages and
 		 * potentially allocating memory.
 		 */
-		if (unlikely(fatal_signal_pending(current))) {
+		if (fatal_signal_pending(current)) {
 			remainder = 0;
 			break;
 		}
@@ -4248,7 +4279,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				break;
 			}
 			if (ret & VM_FAULT_RETRY) {
-				if (nonblocking)
+				if (nonblocking &&
+				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 					*nonblocking = 0;
 				*nr_pages = 0;
 				/*
@@ -4318,21 +4350,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	pte_t pte;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long pages = 0;
-	unsigned long f_start = start;
-	unsigned long f_end = end;
 	bool shared_pmd = false;
+	struct mmu_notifier_range range;
 
 	/*
 	 * In the case of shared PMDs, the area to flush could be beyond
-	 * start/end.  Set f_start/f_end to cover the maximum possible
+	 * start/end.  Set range.start/range.end to cover the maximum possible
 	 * range if PMD sharing is possible.
 	 */
-	adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+	mmu_notifier_range_init(&range, mm, start, end);
+	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 
 	BUG_ON(address >= end);
-	flush_cache_range(vma, f_start, f_end);
+	flush_cache_range(vma, range.start, range.end);
 
-	mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+	mmu_notifier_invalidate_range_start(&range);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
@@ -4367,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			continue;
 		}
 		if (!huge_pte_none(pte)) {
-			pte = huge_ptep_get_and_clear(mm, address, ptep);
-			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+			pte_t old_pte;
+
+			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+			pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
-			set_huge_pte_at(mm, address, ptep, pte);
+			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
 			pages++;
 		}
 		spin_unlock(ptl);
@@ -4383,7 +4417,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * did unshare a page of pmds, flush the range corresponding to the pud.
 	 */
 	if (shared_pmd)
-		flush_hugetlb_tlb_range(vma, f_start, f_end);
+		flush_hugetlb_tlb_range(vma, range.start, range.end);
 	else
 		flush_hugetlb_tlb_range(vma, start, end);
 	/*
@@ -4393,7 +4427,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * See Documentation/vm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+	mmu_notifier_invalidate_range_end(&range);
 
 	return pages << h->order;
 }
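
Note on the mmu_notifier conversion: most hunks above are a mechanical switch from passing (mm, start, end) triples to the invalidate start/end hooks over to a single struct mmu_notifier_range that is initialized once and can be widened (e.g. by adjust_range_if_pmd_sharing_possible()) before the start notifier fires. For orientation, a minimal sketch of the v5.0-era structure and initializer, assuming the three-argument mmu_notifier_range_init() used in this diff (later kernels add event, flags, and vma parameters, so this layout is approximate, not authoritative):

	/* Sketch only: approximates the v5.0 definitions, not copied verbatim. */
	struct mmu_notifier_range {
		struct mm_struct *mm;
		unsigned long start;
		unsigned long end;
		bool blockable;	/* set by the invalidate_range_start() caller */
	};

	static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
						   struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end)
	{
		range->mm = mm;
		range->start = start;
		range->end = end;
	}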
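Note on hugetlb_change_protection(): the bare huge_ptep_get_and_clear()/set_huge_pte_at() pair becomes a huge_ptep_modify_prot_start()/huge_ptep_modify_prot_commit() transaction, giving architectures a hook to optimize the clear-modify-reinstall sequence (batching flushes, or avoiding a transiently invalid entry). On architectures that do not override the hooks, the generic fallbacks reduce to the old sequence; roughly (a sketch of the generic helpers under that assumption, not an arch-specific implementation):

	static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
							unsigned long addr, pte_t *ptep)
	{
		/* Clear the entry so concurrent hardware walks see it invalid. */
		return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
	}

	static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
							unsigned long addr, pte_t *ptep,
							pte_t old_pte, pte_t pte)
	{
		/* old_pte is passed through so an arch can derive the update cheaply. */
		set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
	}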
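Note on the set_page_huge_active() movement: the calls are pushed from allocation time to the point where the page is fully set up (after the rmap/page-table insertion in hugetlb_cow() and hugetlb_mcopy_atomic_pte(), or gated on the new_page flag in hugetlb_no_page()), because only "active" huge pages may be isolated for migration; marking too early would let migration isolate a page whose setup is still in flight. For reference, the flag lives on the first tail page; the helpers defined earlier in mm/hugetlb.c look approximately like this (sketch from the same era, quoted from memory):

	bool page_huge_active(struct page *page)
	{
		VM_BUG_ON_PAGE(!PageHuge(page), page);
		return PageHead(page) && PagePrivate(&page[1]);
	}

	/* never called for tail page */
	static void set_page_huge_active(struct page *page)
	{
		VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
		SetPagePrivate(&page[1]);
	}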