about | summary | refs | log | tree | commit | diff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- mm/hugetlb.c | 132
1 files changed, 83 insertions, 49 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5c390f5a5207..97b1e0290c66 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -15,7 +15,7 @@
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
@@ -25,6 +25,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- int node = -1;
+ int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
@@ -919,7 +920,7 @@ retry_cpuset:
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_migration_supported(h))
+ if (hugepage_movable_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1248,10 +1249,11 @@ void free_huge_page(struct page *page)
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;
- set_page_private(page, 0);
- page->mapping = NULL;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
@@ -1585,8 +1587,8 @@ out_unlock:
return page;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
@@ -2100,9 +2102,9 @@ int __alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = memblock_virt_alloc_try_nid_raw(
+ addr = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
- 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -3233,22 +3235,22 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
- pte_t *src_pte, *dst_pte, entry;
+ pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- mmun_start = vma->vm_start;
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+ if (cow) {
+ mmu_notifier_range_init(&range, src, vma->vm_start,
+ vma->vm_end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
@@ -3261,15 +3263,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /* If the pagetables are shared don't copy or take references */
- if (dst_pte == src_pte)
+ /*
+ * If the pagetables are shared don't copy or take references.
+ * dst_pte == src_pte is the common case of src/dest sharing.
+ *
+ * However, src could have 'unshared' and dst shares with
+ * another vma. If dst_pte !none, this implies sharing.
+ * Check here before taking page table lock, and once again
+ * after taking the lock below.
+ */
+ dst_entry = huge_ptep_get(dst_pte);
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
continue;
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- if (huge_pte_none(entry)) { /* skip none entry */
+ dst_entry = huge_ptep_get(dst_pte);
+ if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ /*
+ * Skip if src entry none. Also, skip in the
+ * unlikely case dst entry !none as this implies
+ * sharing with another vma.
+ */
;
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
@@ -3309,7 +3326,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -3326,8 +3343,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start = start; /* For mmu_notifiers */
- unsigned long mmun_end = end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3343,8 +3359,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
@@ -3412,7 +3429,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
}
@@ -3530,9 +3547,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *old_page, *new_page;
int outside_reserve = 0;
vm_fault_t ret = 0;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
unsigned long haddr = address & huge_page_mask(h);
+ struct mmu_notifier_range range;
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3609,11 +3625,9 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
- mmun_start = haddr;
- mmun_end = mmun_start + huge_page_size(h);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+ mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
@@ -3626,16 +3640,17 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out_release_all:
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
@@ -3690,6 +3705,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return err;
ClearPagePrivate(page);
+ /*
+ * set page dirty so that it will not be removed from cache/file
+ * by non-hugetlbfs specific code paths.
+ */
+ set_page_dirty(page);
+
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
@@ -3709,6 +3730,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3770,7 +3792,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3841,6 +3863,15 @@ retry:
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -4059,7 +4090,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_sem */
if (unlikely(ret)) {
- ret = -EFAULT;
+ ret = -ENOENT;
*pagep = page;
/* don't free the page */
goto out;
@@ -4075,7 +4106,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4143,6 +4173,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4178,7 +4209,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
remainder = 0;
break;
}
@@ -4248,7 +4279,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4318,21 +4350,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
- unsigned long f_start = start;
- unsigned long f_end = end;
bool shared_pmd = false;
+ struct mmu_notifier_range range;
/*
* In the case of shared PMDs, the area to flush could be beyond
- * start/end. Set f_start/f_end to cover the maximum possible
+ * start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+ mmu_notifier_range_init(&range, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
- flush_cache_range(vma, f_start, f_end);
+ flush_cache_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -4367,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
continue;
}
if (!huge_pte_none(pte)) {
- pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte_t old_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
- set_huge_pte_at(mm, address, ptep, pte);
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
@@ -4383,7 +4417,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
if (shared_pmd)
- flush_hugetlb_tlb_range(vma, f_start, f_end);
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
@@ -4393,7 +4427,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
- mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
}