From 4fd017708c4a067da51a2b5cf8aedddf4e840b1f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 12 Oct 2011 21:06:51 +0200 Subject: mm: Check if PTE is already allocated during page fault With transparent hugepage support, handle_mm_fault() has to be careful that a normal PMD has been established before handling a PTE fault. To achieve this, it used __pte_alloc() directly instead of pte_alloc_map as pte_alloc_map is unsafe to run against a huge PMD. pte_offset_map() is called once it is known the PMD is safe. pte_alloc_map() is smart enough to check if a PTE is already present before calling __pte_alloc but this check was lost. As a consequence, PTEs may be allocated unnecessarily and the page table lock taken. This useless PTE does get cleaned up but it's a performance hit which is visible in page_test from aim9. This patch simply re-adds the check normally done by pte_alloc_map to check if the PTE needs to be allocated before taking the page table lock. The effect is noticeable in page_test from aim9. AIM9 2.6.38-vanilla 2.6.38-checkptenone creat-clo 446.10 ( 0.00%) 424.47 (-5.10%) page_test 38.10 ( 0.00%) 42.04 ( 9.37%) brk_test 52.45 ( 0.00%) 51.57 (-1.71%) exec_test 382.00 ( 0.00%) 456.90 (16.39%) fork_test 60.11 ( 0.00%) 67.79 (11.34%) MMTests Statistics: duration Total Elapsed Time (seconds) 611.90 612.22 (While this affects 2.6.38, it is a performance rather than a functional bug and normally outside the rules for -stable. While the big performance differences are for a microbenchmark, the difference in fork and exec performance may be significant enough that -stable wants to consider the patch) Reported-by: Raz Ben Yehuda Signed-off-by: Mel Gorman Signed-off-by: Andrea Arcangeli Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Rik van Riel [ Picked this up from the AutoNUMA tree to help it upstream and to allow apples-to-apples performance comparisons. ] Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c34b415..35c66a269bcc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -710,7 +710,8 @@ out: * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) -- cgit From 1ba6e0b50b479cbadb8f05ebde3020da9ac87201 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 4 Oct 2012 01:51:06 +0200 Subject: mm: numa: split_huge_page: transfer the NUMA type from the pmd to the pte When we split a transparent hugepage, transfer the NUMA type from the pmd to the pte if needed.
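For illustration only (the actual change is the two-line hunk further down this patch): __split_huge_page_map() rebuilds one pte per subpage from the old pmd, and every inherited attribute has to be copied explicitly, so the NUMA hint needs the same treatment as the young bit or it is silently dropped on split. A simplified sketch of that construction step, with the write-protect handling and the surrounding loop omitted:

	entry = mk_pte(page + i, vma->vm_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (!pmd_young(*pmd))
		entry = pte_mkold(entry);
	if (pmd_numa(*pmd))
		entry = pte_mknuma(entry);	/* keep the hinting fault armed */
	set_pte_at(mm, haddr, pte, entry);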
Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 35c66a269bcc..cd24aa562144 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1364,6 +1364,8 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); -- cgit From d10e63f29488b0f312a443f9507ea9b6fd3c9090 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:31 +0200 Subject: mm: numa: Create basic numa page hinting infrastructure Note: This patch started as "mm/mpol: Create special PROT_NONE infrastructure" and preserves the basic idea but steals *very* heavily from "autonuma: numa hinting page faults entry points" for the actual fault handlers without the migration parts. The end result is barely recognisable as either patch so all Signed-off and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with this version, I will re-add the signed-offs-by to reflect the history. In order to facilitate a lazy -- fault driven -- migration of pages, create a special transient PAGE_NUMA variant, we can then use the 'spurious' protection faults to drive our migrations from. The meaning of PAGE_NUMA depends on the architecture but on x86 it is effectively PROT_NONE. Actual PROT_NONE mappings will not generate these NUMA faults for the reason that the page fault code checks the permission on the VMA (and will throw a segmentation fault on actual PROT_NONE mappings), before it ever calls handle_mm_fault. 
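To make "PAGE_NUMA is effectively PROT_NONE on x86" concrete, the series relies on helpers of roughly the following shape (a sketch of the generic variants added elsewhere in the series, not part of the hunks shown here; it assumes _PAGE_NUMA aliases _PAGE_PROTNONE and that the architecture provides pte_flags()/pte_set_flags()/pte_clear_flags()):

	/*
	 * A NUMA hinting pte is "protnone set, present clear": hardware faults
	 * on the next access, while the kernel can still distinguish it from a
	 * real PROT_NONE mapping by checking the VMA protections first.
	 */
	static inline int pte_numa(pte_t pte)
	{
		return (pte_flags(pte) & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA;
	}

	static inline pte_t pte_mknuma(pte_t pte)
	{
		pte = pte_set_flags(pte, _PAGE_NUMA);
		return pte_clear_flags(pte, _PAGE_PRESENT);
	}

	static inline pte_t pte_mknonnuma(pte_t pte)
	{
		pte = pte_clear_flags(pte, _PAGE_NUMA);
		return pte_set_flags(pte, _PAGE_PRESENT | _PAGE_ACCESSED);
	}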
[dhillf@gmail.com: Fix typo] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/huge_mm.h | 10 +++++ mm/huge_memory.c | 22 ++++++++++ mm/memory.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 3 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b31cb7da0346..a1d26a98c655 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } + +extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, { return 0; } + +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp) +{ +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cd24aa562144..f5f37630c54d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1018,6 +1018,28 @@ out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, + pmd_t pmd, pmd_t *pmdp) +{ + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { diff --git a/mm/memory.c b/mm/memory.c index cd8e0daf1912..e30616f2cc3d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } +int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ + struct page *page; + spinlock_t *ptl; + + /* + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + * + * ptep_modify_prot_start is not called as this is clearing + * the _PAGE_NUMA bit and it is not really expected that there + * would be concurrent hardware modifications to the PTE. 
+ */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*ptep, pte))) + goto out_unlock; + pte = pte_mknonnuma(pte); + set_pte_at(mm, addr, ptep, pte); + update_mmu_cache(vma, addr, ptep); + + page = vm_normal_page(vma, addr, pte); + if (!page) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + +out_unlock: + pte_unmap_unlock(ptep, ptl); + return 0; +} + +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd; + pte_t *pte, *orig_pte; + unsigned long _addr = addr & PMD_MASK; + unsigned long offset; + spinlock_t *ptl; + bool numa = false; + + spin_lock(&mm->page_table_lock); + pmd = *pmdp; + if (pmd_numa(pmd)) { + set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); + numa = true; + } + spin_unlock(&mm->page_table_lock); + + if (!numa) + return 0; + + /* we're in a page fault so some vma must be in the range */ + BUG_ON(!vma); + BUG_ON(vma->vm_start >= _addr + PMD_SIZE); + offset = max(_addr, vma->vm_start) & ~PMD_MASK; + VM_BUG_ON(offset >= PMD_SIZE); + orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); + pte += offset >> PAGE_SHIFT; + for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { + pte_t pteval = *pte; + struct page *page; + if (!pte_present(pteval)) + continue; + if (!pte_numa(pteval)) + continue; + if (addr >= vma->vm_end) { + vma = find_vma(mm, addr); + /* there's a pte present so there must be a vma */ + BUG_ON(!vma); + BUG_ON(addr < vma->vm_start); + } + if (pte_numa(pteval)) { + pteval = pte_mknonnuma(pteval); + set_pte_at(mm, addr, pte, pteval); + } + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page)) + continue; + } + pte_unmap_unlock(orig_pte, ptl); + + return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } + if (pte_numa(entry)) + return do_numa_page(mm, vma, address, entry, pte, pmd); + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3554,9 +3654,11 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) { + if (pmd_numa(*pmd)) + return do_huge_pmd_numa_page(mm, address, + orig_pmd, pmd); + + if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); /* @@ -3568,10 +3670,14 @@ retry: goto retry; return ret; } + return 0; } } + if (pmd_numa(*pmd)) + return do_pmd_numa_page(mm, vma, address, pmd); + /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could -- cgit From 4daae3b4b9e49b7e0935499a352f1c59d90287d2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 11:33:45 +0000 Subject: mm: mempolicy: Use _PAGE_NUMA to migrate pages Note: Based on "mm/mpol: Use special PROT_NONE to migrate pages" but sufficiently different that the signed-off-bys were dropped Combine our previous _PAGE_NUMA, mpol_misplaced and migrate_misplaced_page() pieces into an effective migrate on fault scheme. 
Note that (on x86) we rely on PROT_NONE pages being !present and avoid the TLB flush from try_to_unmap(TTU_MIGRATION). This greatly improves the page-migration performance. Based-on-work-by: Peter Zijlstra Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 9 +++++---- mm/huge_memory.c | 31 ++++++++++++++++++++++++++++--- mm/memory.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 60 insertions(+), 12 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1d26a98c655..dabb5108d6c0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -160,8 +160,8 @@ static inline struct page *compound_trans_head(struct page *page) return page; } -extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp); +extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) @@ -200,9 +200,10 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, return 0; } -static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp) +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f5f37630c54d..5723b551c023 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1019,17 +1020,39 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, - pmd_t pmd, pmd_t *pmdp) +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page; + struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); + get_page(page); + spin_unlock(&mm->page_table_lock); + + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) + goto clear_pmdnuma; + + /* + * Due to lacking code to migrate thp pages, we'll split + * (which preserves the special PROT_NONE) and re-take the + * fault on the normal pages. 
+ */ + split_huge_page(page); + put_page(page); + return 0; + +clear_pmdnuma: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); @@ -1037,6 +1060,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, out_unlock: spin_unlock(&mm->page_table_lock); + if (page) + put_page(page); return 0; } diff --git a/mm/memory.c b/mm/memory.c index e30616f2cc3d..d52542680e10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -3451,8 +3452,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { - struct page *page; + struct page *page = NULL; spinlock_t *ptl; + int current_nid, target_nid; /* * The "pte" at this point cannot be used safely without @@ -3465,8 +3467,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); - if (unlikely(!pte_same(*ptep, pte))) - goto out_unlock; + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + pte = pte_mknonnuma(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3477,8 +3482,25 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } -out_unlock: + get_page(page); + current_nid = page_to_nid(page); + target_nid = mpol_misplaced(page, vma, addr); pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + /* + * Account for the fault against the current node if it not + * being replaced regardless of where the page is located. + */ + current_nid = numa_node_id(); + put_page(page); + goto out; + } + + /* Migrate to the requested node */ + if (migrate_misplaced_page(page, target_nid)) + current_nid = target_nid; + +out: return 0; } @@ -3655,7 +3677,7 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { if (pmd_numa(*pmd)) - return do_huge_pmd_numa_page(mm, address, + return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { -- cgit From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Implement change_prot_numa() in terms of change_protection() This patch converts change_prot_numa() to use change_protection(). As pte_numa and friends check the PTE bits directly it is necessary for change_protection() to use pmd_mknuma(). Hence the required modifications to change_protection() are a little clumsy but the end result is that most of the numa page table helpers are just one or two instructions. 
Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 3 +- include/linux/mm.h | 4 +- mm/huge_memory.c | 14 ++++- mm/mempolicy.c | 137 +++++------------------------------------------- mm/mprotect.c | 72 +++++++++++++++++++------ 5 files changed, 85 insertions(+), 145 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index dabb5108d6c0..027ad04ef3a8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -27,7 +27,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/include/linux/mm.h b/include/linux/mm.h index 471185e29bab..d04c2f0aab36 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1080,7 +1080,7 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long flags, unsigned long new_addr); extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable); + int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1552,7 +1552,7 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) #endif #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -void change_prot_numa(struct vm_area_struct *vma, +unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5723b551c023..d79f7a55bf6f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1147,7 +1147,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1155,7 +1155,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 51d3ebd8561e..75d4600a5e92 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -568,134 +568,23 @@ static inline int check_pgd_range(struct vm_area_struct *vma, #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* - * Here we search for not shared page mappings (mapcount == 1) and we - * set up the pmd/pte_numa on those mappings so the very next access - * will fire a NUMA hinting page fault. + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. 
*/ -static int -change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, *_pte; - struct page *page; - unsigned long _address, end; - spinlock_t *ptl; - int ret = 0; - - VM_BUG_ON(address & ~PAGE_MASK); - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto out; - - if (pmd_trans_huge_lock(pmd, vma) == 1) { - int page_nid; - ret = HPAGE_PMD_NR; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page = pmd_page(*pmd); - - /* only check non-shared pages */ - if (page_mapcount(page) != 1) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page_nid = page_to_nid(page); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - ret += HPAGE_PMD_NR; - /* defer TLB flush to lower the overhead */ - spin_unlock(&mm->page_table_lock); - goto out; - } - - if (pmd_trans_unstable(pmd)) - goto out; - VM_BUG_ON(!pmd_present(*pmd)); - - end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _address < end; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (!pte_present(pteval)) - continue; - if (pte_numa(pteval)) - continue; - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (page_mapcount(page) != 1) - continue; - - set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); - - /* defer TLB flush to lower the overhead */ - ret++; - } - pte_unmap_unlock(pte, ptl); - - if (ret && !pmd_numa(*pmd)) { - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); - /* defer TLB flush to lower the overhead */ - } - -out: - return ret; -} - -/* Assumes mmap_sem is held */ -void -change_prot_numa(struct vm_area_struct *vma, - unsigned long address, unsigned long end) +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; - int progress = 0; - - while (address < end) { - VM_BUG_ON(address < vma->vm_start || - address + PAGE_SIZE > vma->vm_end); + int nr_updated; + BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - progress += change_prot_numa_range(mm, vma, address); - address = (address + PMD_SIZE) & PMD_MASK; - } + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); - /* - * Flush the TLB for the mm to start the NUMA hinting - * page faults after we finish scanning this vma part - * if there were any PTE updates - */ - if (progress) { - mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); - flush_tlb_range(vma, address, end); - mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); - } + return nr_updated; } #else static unsigned long change_prot_numa(struct vm_area_struct *vma, diff --git a/mm/mprotect.c b/mm/mprotect.c index 7c3628a8b486..7ef6ae964e8f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,10 +35,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long 
addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -49,19 +50,39 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + if (!prot_numa) { + ptent = pte_modify(ptent, newprot); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page) { + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { + ptent = pte_mknuma(ptent); + updated = true; + } + } + } /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent)) { ptent = pte_mkwrite(ptent); + updated = true; + } + + if (updated) + pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); - pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -83,9 +104,25 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, return pages; } +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -97,7 +134,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -105,8 +142,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * } if (pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + + if (prot_numa) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); return pages; @@ -114,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -126,7 +166,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); return pages; @@ -134,7 +174,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * static unsigned long 
change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -150,7 +190,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -162,7 +202,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; unsigned long pages; @@ -171,7 +211,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); mmu_notifier_invalidate_range_end(mm, start, end); return pages; @@ -249,7 +289,7 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); -- cgit From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. 
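The pieces introduced below are easiest to read as one feedback loop; the following is an illustrative summary only, using the names from the patch:

	/*
	 * task_tick_numa()                     scheduler tick, rate limited by node_stamp
	 *   -> task_work_add(task_numa_work)
	 * task_numa_work()                     task_work context, takes mmap_sem for read
	 *   -> change_prot_numa(vma, ...)      mark PTEs pte_numa (PROT_NONE-like)
	 * ... the task later touches a marked page ...
	 * do_numa_page() / do_huge_pmd_numa_page()
	 *   -> task_numa_fault(node, pages)    placement hint; the policy itself is
	 *                                      still a FIXME in this patch
	 */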
Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- arch/sh/mm/Kconfig | 1 + arch/x86/Kconfig | 2 + include/linux/mm_types.h | 11 +++++ include/linux/sched.h | 20 ++++++++ kernel/sched/core.c | 13 +++++ kernel/sched/fair.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 7 +++ kernel/sched/sched.h | 6 +++ kernel/sysctl.c | 24 ++++++++- mm/huge_memory.c | 5 +- mm/memory.c | 14 +++++- 11 files changed, 224 insertions(+), 4 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index cb8f9920f4dd..0f7c852f355c 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -111,6 +111,7 @@ config VSYSCALL config NUMA bool "Non Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL + select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n help Some SH systems have many various memories scattered around diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced2..1137028fc6d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,8 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 31f8a3af7d94..ed8638c29b3e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -397,6 +397,17 @@ struct mm_struct { #endif #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; +#endif +#ifdef CONFIG_NUMA_BALANCING + /* + * numa_next_scan is the next time when the PTEs will me marked + * pte_numa to gather statistics and migrate pages to new nodes + * if necessary + */ + unsigned long numa_next_scan; + + /* numa_scan_seq prevents two threads setting pte_numa */ + int numa_scan_seq; #endif struct uprobes_state uprobes_state; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2e..844af5b12cb2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1479,6 +1479,14 @@ struct task_struct { short il_next; short pref_node_fork; #endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + int numa_migrate_seq; + unsigned int numa_scan_period; + u64 node_stamp; /* migration stamp */ + struct callback_head numa_work; +#endif /* CONFIG_NUMA_BALANCING */ + struct rcu_head rcu; /* @@ -1553,6 +1561,14 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int node, int pages); +#else +static inline void task_numa_fault(int node, int pages) +{ +} +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -1990,6 +2006,10 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_settle_count; + #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..cad0d092ce3b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..6831abb5dbef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include @@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms: 5s + */ +unsigned int sysctl_numa_balancing_scan_period_min = 5000; +unsigned int sysctl_numa_balancing_scan_period_max = 5000*16; + +static void task_numa_placement(struct task_struct *p) +{ + int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages) +{ + struct task_struct *p = current; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + task_numa_placement(p); +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * Enforce maximal scan/migration frequency.. 
+ */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + ACCESS_ONCE(mm->numa_scan_seq)++; + { + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + change_prot_numa(vma, vma->vm_start, vma->vm_end); + } + up_read(&mm->mmap_sem); + } +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..5fb7aefbec80 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, true) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfabc..ae31c051ff2f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#else +#define sched_feat_numa(x) (0) +#endif + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f9..025e1ae50ef1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG 
*/ #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d79f7a55bf6f..ee8133794a56 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ split_huge_page(page); put_page(page); + return 0; clear_pmdnuma: @@ -1060,8 +1061,10 @@ clear_pmdnuma: out_unlock: spin_unlock(&mm->page_table_lock); - if (page) + if (page) { put_page(page); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + } return 0; } diff --git a/mm/memory.c b/mm/memory.c index d52542680e10..8012c1907895 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid, target_nid; + int current_nid = -1; + int target_nid; /* * The "pte" at this point cannot be used safely without @@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, current_nid = target_nid; out: + task_numa_fault(current_nid, 1); return 0; } @@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; + int curr_nid; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = vm_normal_page(vma, addr, pteval); if (unlikely(!page)) continue; + /* only check non-shared pages */ + if (unlikely(page_mapcount(page) != 1)) + continue; + pte_unmap_unlock(pte, ptl); + + curr_nid = page_to_nid(page); + task_numa_fault(curr_nid, 1); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); -- cgit From 03c5a6e16322c997bf8f264851bfa3f532ad515f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 14:52:48 +0000 Subject: mm: numa: Add pte updates, hinting and migration stats It is tricky to quantify the basic cost of automatic NUMA placement in a meaningful manner. This patch adds some vmstats that can be used as part of a basic costing model. u = basic unit = sizeof(void *) Ca = cost of struct page access = sizeof(struct page) / u Cpte = Cost PTE access = Ca Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock) where Cpte is incurred twice for a read and a write and Wlock is a constant representing the cost of taking or releasing a lock Cnumahint = Cost of a minor page fault = some high constant e.g. 
1000 Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u Ci = Cost of page isolation = Ca + Wi where Wi is a constant that should reflect the approximate cost of the locking operation Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma) where Wnuma is the approximate NUMA factor. 1 is local. 1.2 would imply that remote accesses are 20% more expensive Balancing cost = Cpte * numa_pte_updates + Cnumahint * numa_hint_faults + Ci * numa_pages_migrated + Cpagecopy * numa_pages_migrated Note that numa_pages_migrated is used as a measure of how many pages were isolated even though it would miss pages that failed to migrate. A vmstat counter could have been added for it but the isolation cost is pretty marginal in comparison to the overall cost so it seemed overkill. The ideal way to measure automatic placement benefit would be to count the number of remote accesses versus local accesses and do something like benefit = (remote_accesses_before - remove_access_after) * Wnuma but the information is not readily available. As a workload converges, the expection would be that the number of remote numa hints would reduce to 0. convergence = numa_hint_faults_local / numa_hint_faults where this is measured for the last N number of numa hints recorded. When the workload is fully converged the value is 1. This can measure if the placement policy is converging and how fast it is doing it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- include/linux/vm_event_item.h | 6 ++++++ include/linux/vmstat.h | 8 ++++++++ mm/huge_memory.c | 5 +++++ mm/memory.c | 12 ++++++++++++ mm/mempolicy.c | 2 ++ mm/migrate.c | 3 ++- mm/vmstat.c | 6 ++++++ 7 files changed, 41 insertions(+), 1 deletion(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a1f750b8e72a..55600049e794 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,6 +38,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, +#ifdef CONFIG_NUMA_BALANCING + NUMA_PTE_UPDATES, + NUMA_HINT_FAULTS, + NUMA_HINT_FAULTS_LOCAL, + NUMA_PAGE_MIGRATE, +#endif #ifdef CONFIG_MIGRATION PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 92a86b2cce33..a13291f7da88 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu) #endif /* CONFIG_VM_EVENT_COUNTERS */ +#ifdef CONFIG_NUMA_BALANCING +#define count_vm_numa_event(x) count_vm_event(x) +#define count_vm_numa_events(x, y) count_vm_events(x, y) +#else +#define count_vm_numa_event(x) do {} while (0) +#define count_vm_numa_events(x, y) do {} while (0) +#endif /* CONFIG_NUMA_BALANCING */ + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee8133794a56..f3a477fffd09 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1026,6 +1026,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; + int current_nid = -1; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,6 +1035,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); 
spin_unlock(&mm->page_table_lock); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) diff --git a/mm/memory.c b/mm/memory.c index 8012c1907895..8a7b4ccbe136 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3477,6 +3477,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); + count_vm_numa_event(NUMA_HINT_FAULTS); page = vm_normal_page(vma, addr, pte); if (!page) { pte_unmap_unlock(ptep, ptl); @@ -3485,6 +3486,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(page); current_nid = page_to_nid(page); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, addr); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { @@ -3517,6 +3520,9 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; + int local_nid = numa_node_id(); + unsigned long nr_faults = 0; + unsigned long nr_faults_local = 0; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3565,10 +3571,16 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, curr_nid = page_to_nid(page); task_numa_fault(curr_nid, 1); + nr_faults++; + if (curr_nid == local_nid) + nr_faults_local++; + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); + count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults); + count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local); return 0; } #else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a7a62fe7c280..516491fbfaa8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -583,6 +583,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); return nr_updated; } diff --git a/mm/migrate.c b/mm/migrate.c index c7d550011a64..23bba5d6edff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1514,7 +1514,8 @@ int migrate_misplaced_page(struct page *page, int node) if (nr_remaining) { putback_lru_pages(&migratepages); isolated = 0; - } + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); } BUG_ON(!list_empty(&migratepages)); out: diff --git a/mm/vmstat.c b/mm/vmstat.c index 3a067fabe190..c0f1f6db5182 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,6 +774,12 @@ const char * const vmstat_text[] = { "pgrotated", +#ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", + "numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", +#endif #ifdef CONFIG_MIGRATION "pgmigrate_success", "pgmigrate_fail", -- cgit From 5aa80374a10567f8e25de0615d3d40f3aa3a4298 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 27 Nov 2012 14:40:32 +0000 Subject: mm: numa: split_huge_page: Transfer last_nid on tail page Pass last_nid from head page to tail page. 
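For context, the one-line change below depends on helpers introduced earlier in the series rather than in this patch; a sketch of their shape, assuming the CONFIG_NUMA_BALANCING variant where struct page carries a dedicated _last_nid field (other configurations may pack the id into page->flags):

	static inline int page_last_nid(struct page *page)
	{
		return page->_last_nid;
	}

	static inline int page_xchg_last_nid(struct page *page, int nid)
	{
		return xchg(&page->_last_nid, nid);
	}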
Signed-off-by: Hillf Danton Signed-off-by: Mel Gorman --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/huge_memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3a477fffd09..79b96064f8fc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1362,6 +1362,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; + page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); -- cgit From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. Signed-off-by: Mel Gorman --- include/linux/mm_types.h | 3 +++ include/linux/sched.h | 5 +++-- kernel/sched/core.c | 1 + kernel/sched/fair.c | 29 +++++++++++++++++++++-------- kernel/sysctl.c | 7 +++++++ mm/huge_memory.c | 2 +- mm/memory.c | 12 ++++++++---- 7 files changed, 44 insertions(+), 15 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c5fffa239861..e850a23dd6ec 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,6 +410,9 @@ struct mm_struct { */ unsigned long numa_next_scan; + /* numa_next_reset is when the PTE scanner period will be reset */ + unsigned long numa_next_reset; + /* Restart point for scanning and setting pte_numa */ unsigned long numa_scan_offset; diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d95a232b5b9..0f4ff2bd03f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1562,9 +1562,9 @@ struct task_struct { #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages); +extern void task_numa_fault(int node, int pages, bool migrated); #else -static inline void task_numa_fault(int node, int pages) +static inline void task_numa_fault(int node, int pages, bool migrated) { } #endif @@ -2009,6 +2009,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbfc4843063f..9d255bc0e278 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1537,6 +1537,7 @@ static void 
__sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; p->mm->numa_scan_seq = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd18087fd369..4b577863933f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * numa task sample period in ms */ unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*16; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages) +void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; /* FIXME: Allocate task-specific structure for placement policy here */ /* - * Assume that as faults occur that pages are getting properly placed - * and fewer NUMA hints are required. Note that this is a big - * assumption, it assumes processes reach a steady steady with no - * further phase changes. + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes */ - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(2)); + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); } @@ -857,6 +857,19 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + /* * Enforce maximal scan/migration frequency.. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 48a68cc258c1..8906f90d6fa2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79b96064f8fc..199b261a257e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1068,7 +1068,7 @@ out_unlock: spin_unlock(&mm->page_table_lock); if (page) { put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); } return 0; } diff --git a/mm/memory.c b/mm/memory.c index 84c6d9eab182..39edb11b63dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3468,6 +3468,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int current_nid = -1; int target_nid; + bool migrated = false; /* * The "pte" at this point cannot be used safely without @@ -3509,12 +3510,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Migrate to the requested node */ - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) current_nid = target_nid; out: if (current_nid != -1) - task_numa_fault(current_nid, 1); + task_numa_fault(current_nid, 1, migrated); return 0; } @@ -3554,6 +3556,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; int curr_nid = local_nid; int target_nid; + bool migrated; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3590,9 +3593,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ pte_unmap_unlock(pte, ptl); - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) curr_nid = target_nid; - task_numa_fault(curr_nid, 1); + task_numa_fault(curr_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit From b32967ff101a7508f70be8de59b278d4df92fa00 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 12:35:47 +0000 Subject: mm: numa: Add THP migration for the NUMA working set scanning fault case. Note: This is very heavily based on a patch from Peter Zijlstra with fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner. That patch put a lot of migration logic into mm/huge_memory.c where it does not belong. This version puts tries to share some of the migration logic with migrate_misplaced_page. However, it should be noted that now migrate.c is doing more with the pagetable manipulation than is preferred. The end result is barely recognisable so as before, the signed-offs had to be removed but will be re-added if the original authors are ok with it. Add THP migration for the NUMA working set scanning fault case. It uses the page lock to serialize. No migration pte dance is necessary because the pte is already unmapped when we decide to migrate. 
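The serialisation the changelog describes reduces to a simple pattern in the fault handler; this is an illustrative reduction of the do_huge_pmd_numa_page() changes in the hunk below, with the failure paths and statistics simplified:

	get_page(page);
	spin_unlock(&mm->page_table_lock);

	lock_page(page);			/* serialise against concurrent THP migration */

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(pmd, *pmdp))) {	/* pmd changed while we were unlocked? */
		unlock_page(page);
		put_page(page);
		goto out_unlock;
	}
	spin_unlock(&mm->page_table_lock);

	/* now safe to hand the locked page to migrate_misplaced_transhuge_page() */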
[dhillf@gmail.com: Fix memory leak on isolation failure] [dhillf@gmail.com: Fix transfer of last_nid information] Signed-off-by: Mel Gorman --- include/linux/migrate.h | 15 ++++ mm/huge_memory.c | 59 +++++++++---- mm/internal.h | 7 +- mm/memcontrol.c | 7 +- mm/migrate.c | 231 +++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 255 insertions(+), 64 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 91556889adac..51eac4bdc606 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -79,6 +79,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, extern int migrate_misplaced_page(struct page *page, int node); extern int migrate_misplaced_page(struct page *page, int node); extern bool migrate_ratelimited(int node); +extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node); + #else static inline int migrate_misplaced_page(struct page *page, int node) { @@ -88,6 +94,15 @@ static inline bool migrate_ratelimited(int node) { return false; } + +static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + return -EAGAIN; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 199b261a257e..711baf84b153 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -600,7 +600,7 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -1023,10 +1023,12 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page = NULL; + struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; + bool migrated; + bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); - spin_unlock(&mm->page_table_lock); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) + if (target_nid == -1) { + put_page(page); goto clear_pmdnuma; + } - /* - * Due to lacking code to migrate thp pages, we'll split - * (which preserves the special PROT_NONE) and re-take the - * fault on the normal pages. 
- */ - split_huge_page(page); - put_page(page); - - return 0; + /* Acquire the page lock to serialise THP migrations */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + page_locked = true; -clear_pmdnuma: + /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); goto out_unlock; + } + spin_unlock(&mm->page_table_lock); + + /* Migrate the THP to the requested node */ + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, + page, target_nid); + if (migrated) + current_nid = target_nid; + else { + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + goto out_unlock; + } + goto clear_pmdnuma; + } + + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; +clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + if (page_locked) + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); - if (page) { - put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); - } + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); return 0; } diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..7e60ac826f2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -212,15 +212,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; + int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); - __inc_zone_page_state(newpage, NR_MLOCK); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); local_irq_restore(flags); } } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..d97af9636ab2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3288,15 +3288,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); + pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3358,7 +3361,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. 
*/ - __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/mm/migrate.c b/mm/migrate.c index 2a5ce135eef0..c9400960fd52 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page)) + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node) return true; } -/* - * Attempt to migrate a misplaced page to the specified destination - * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. - */ -int migrate_misplaced_page(struct page *page, int node) +/* Returns true if the node is migrate rate-limited after the update */ +bool numamigrate_update_ratelimit(pg_data_t *pgdat) { - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - LIST_HEAD(migratepages); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) { - put_page(page); - goto out; - } + bool rate_limited = false; /* * Rate-limit the amount of data that is being migrated to a node. @@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node) pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - spin_unlock(&pgdat->numabalancing_migrate_lock); - put_page(page); - goto out; - } - pgdat->numabalancing_migrate_nr_pages++; + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + rate_limited = true; + else + pgdat->numabalancing_migrate_nr_pages++; spin_unlock(&pgdat->numabalancing_migrate_lock); + + return rate_limited; +} + +int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +{ + int ret = 0; /* Avoid migrating to a node that is nearly full */ if (migrate_balanced_pgdat(pgdat, 1)) { @@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node) if (isolate_lru_page(page)) { put_page(page); - goto out; + return 0; } - isolated = 1; + /* Page is isolated */ + ret = 1; page_lru = page_is_file_cache(page); - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - list_add(&page->lru, &migratepages); + if (!PageTransHuge(page)) + inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); + else + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + HPAGE_PMD_NR); } /* @@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node) */ put_page(page); - if (isolated) { - int nr_remaining; - - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); - if (nr_remaining) { - putback_lru_pages(&migratepages); - isolated = 0; - } else - count_vm_numa_event(NUMA_PAGE_MIGRATE); + return ret; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. 
+ */ +int migrate_misplaced_page(struct page *page, int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + int nr_remaining; + LIST_HEAD(migratepages); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) { + put_page(page); + goto out; } + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat)) { + put_page(page); + goto out; + } + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) + goto out; + + list_add(&page->lru, &migratepages); + nr_remaining = migrate_pages(&migratepages, + alloc_misplaced_dst_page, + node, false, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); BUG_ON(!list_empty(&migratepages)); out: return isolated; } + +int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) + goto out_dropref; + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat)) + goto out_dropref; + + new_page = alloc_pages_node(node, + (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + if (!new_page) + goto out_dropref; + page_xchg_last_nid(new_page, page_last_nid(page)); + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) { + put_page(new_page); + goto out_keep_locked; + } + + /* Prepare a page as a migration target */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + + /* anon mapping, we can simply copy page->mapping to the new page: */ + new_page->mapping = page->mapping; + new_page->index = page->index; + migrate_page_copy(new_page, page); + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, entry))) { + spin_unlock(&mm->page_table_lock); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) + SetPageActive(page); + if (TestClearPageUnevictable(new_page)) + SetPageUnevictable(page); + mlock_migrate_page(page, new_page); + + unlock_page(new_page); + put_page(new_page); /* Free it */ + + unlock_page(page); + putback_lru_page(page); + + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + goto out; + } + + /* + * Traditional migration needs to prepare the memcg charge + * transaction early to prevent the old page from being + * uncharged when installing migration entries. Here we can + * save the potential rollback and start the charge transfer + * only when migration is already known to end successfully. 
+ */ + mem_cgroup_prepare_migration(page, new_page, &memcg); + + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = pmd_mknonnuma(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + + page_add_new_anon_rmap(new_page, vma, haddr); + + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, entry); + page_remove_rmap(page); + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge + * before it's fully transferred to the new page. + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(&mm->page_table_lock); + + unlock_page(new_page); + unlock_page(page); + put_page(page); /* Drop the rmap reference */ + put_page(page); /* Drop the LRU isolation reference */ + + count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); + count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +out: + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + -HPAGE_PMD_NR); + return isolated; + +out_dropref: + put_page(page); +out_keep_locked: + return 0; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ -- cgit From 5a505085f043e8380f83610f79642853c051e2f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:46 +0000 Subject: mm/rmap: Convert the struct anon_vma::mutex to an rwsem Convert the struct anon_vma::mutex to an rwsem, which will help in solving a page-migration scalability problem. (Addressed in a separate patch.) The conversion is simple and straightforward: in every case where we mutex_lock()ed we'll now down_write(). Suggested-by: Linus Torvalds Reviewed-by: Rik van Riel Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/linux/rmap.h | 16 ++++++++-------- mm/huge_memory.c | 4 ++-- mm/mmap.c | 8 ++++---- mm/rmap.c | 22 +++++++++++----------- 4 files changed, 25 insertions(+), 25 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bfe1f4780644..f3f41d242e25 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include /* @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. 
*/ struct anon_vma { - struct anon_vma *root; /* Root of this anon_vma tree */ - struct mutex mutex; /* Serialize access to vma list */ + struct anon_vma *root; /* Root of this anon_vma tree */ + struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for @@ -64,7 +64,7 @@ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct rb_node rb; /* locked by anon_vma->mutex */ + struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; @@ -108,24 +108,24 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - mutex_lock(&anon_vma->root->mutex); + down_write(&anon_vma->root->rwsem); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - mutex_unlock(&anon_vma->root->mutex); + up_write(&anon_vma->root->rwsem); } static inline void anon_vma_lock(struct anon_vma *anon_vma) { - mutex_lock(&anon_vma->root->mutex); + down_write(&anon_vma->root->rwsem); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - mutex_unlock(&anon_vma->root->mutex); + up_write(&anon_vma->root->rwsem); } /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 711baf84b153..acd37fe55eb7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1292,7 +1292,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->mutex to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1495,7 +1495,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { diff --git a/mm/mmap.c b/mm/mmap.c index 9a796c41e7d9..88408632da66 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2561,15 +2561,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); + down_write(&anon_vma->root->rwsem); /* * We can safely modify head.next after taking the - * anon_vma->root->mutex. If some other vma in this mm shares + * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) @@ -2671,7 +2671,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. 
*/ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) diff --git a/mm/rmap.c b/mm/rmap.c index 2ee1ef0f317b..6e3ee3b82798 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex - * anon_vma->mutex + * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@ * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock */ @@ -103,7 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ - if (mutex_is_locked(&anon_vma->root->mutex)) { + if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock(anon_vma); anon_vma_unlock(anon_vma); } @@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); root = new_root; - mutex_lock(&root->mutex); + down_write(&root->rwsem); } return root; } @@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); } /* @@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() - * needing to acquire the anon_vma->root->mutex. + * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; @@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - mutex_init(&anon_vma->mutex); + init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT; } @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (mutex_trylock(&root_anon_vma->mutex)) { + if (down_write_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - mutex_unlock(&root_anon_vma->mutex); + up_write(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -1299,7 +1299,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->mutex or mapping->i_mmap_mutex. + * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. 
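Taken on its own, the conversion above is purely mechanical: the lock type changes from a mutex to an rw_semaphore, but every existing call site still takes it exclusively, so the locking semantics are unchanged; the read-side relaxation only arrives in the next patch. A minimal user-space analogue of that step, assuming POSIX rwlocks in place of the kernel rwsem and with invented names:

/*
 * Sketch of the mutex -> rwsem step: swap the lock type, keep every
 * caller exclusive (write-locked), so behaviour is identical to the
 * old mutex. Not the kernel API; names are illustrative only.
 */
#include <pthread.h>

struct anon_vma_model {
        pthread_rwlock_t rwsem;                 /* was: pthread_mutex_t mutex */
};

static void model_init(struct anon_vma_model *av)
{
        pthread_rwlock_init(&av->rwsem, NULL);  /* was: pthread_mutex_init() */
}

/* Every former mutex_lock() site becomes a write lock: still exclusive. */
static void model_lock(struct anon_vma_model *av)
{
        pthread_rwlock_wrlock(&av->rwsem);      /* was: pthread_mutex_lock() */
}

static void model_unlock(struct anon_vma_model *av)
{
        pthread_rwlock_unlock(&av->rwsem);      /* was: pthread_mutex_unlock() */
}

int main(void)
{
        struct anon_vma_model av;

        model_init(&av);
        model_lock(&av);        /* same exclusion as before the conversion */
        model_unlock(&av);
        pthread_rwlock_destroy(&av.rwsem);
        return 0;
}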
-- cgit From 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Dec 2012 19:56:50 +0000 Subject: mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable rmap_walk_anon() and try_to_unmap_anon() appears to be too careful about locking the anon vma: while it needs protection against anon vma list modifications, it does not need exclusive access to the list itself. Transforming this exclusive lock to a read-locked rwsem removes a global lock from the hot path of page-migration intense threaded workloads which can cause pathological performance like this: 96.43% process 0 [kernel.kallsyms] [k] perf_trace_sched_switch | --- perf_trace_sched_switch __schedule schedule schedule_preempt_disabled __mutex_lock_common.isra.6 __mutex_lock_slowpath mutex_lock | |--50.61%-- rmap_walk | move_to_new_page | migrate_pages | migrate_misplaced_page | __do_numa_page.isra.69 | handle_pte_fault | handle_mm_fault | __do_page_fault | do_page_fault | page_fault | __memset_sse2 | | | --100.00%-- worker_thread | | | --100.00%-- start_thread | --49.39%-- page_lock_anon_vma try_to_unmap_anon try_to_unmap migrate_pages migrate_misplaced_page __do_numa_page.isra.69 handle_pte_fault handle_mm_fault __do_page_fault do_page_fault page_fault __memset_sse2 | --100.00%-- worker_thread start_thread With this change applied the profile is now nicely flat and there's no anon-vma related scheduling/blocking. Rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), to make it clearer that it's an exclusive write-lock in that case - suggested by Rik van Riel. Suggested-by: Linus Torvalds Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Christoph Lameter Cc: Rik van Riel Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 2 +- include/linux/rmap.h | 17 ++++++++++++++--- mm/huge_memory.c | 6 +++--- mm/ksm.c | 6 +++--- mm/memory-failure.c | 4 ++-- mm/migrate.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/rmap.c | 48 ++++++++++++++++++++++++------------------------ 9 files changed, 50 insertions(+), 39 deletions(-) (limited to 'mm/huge_memory.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 027ad04ef3a8..0d1208c0bdc4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -102,7 +102,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #define wait_split_huge_page(__anon_vma, __pmd) \ do { \ pmd_t *____pmd = (__pmd); \ - anon_vma_lock(__anon_vma); \ + anon_vma_lock_write(__anon_vma); \ anon_vma_unlock(__anon_vma); \ BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index f3f41d242e25..c20635c527a9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -118,7 +118,7 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) up_write(&anon_vma->root->rwsem); } -static inline void anon_vma_lock(struct anon_vma *anon_vma) +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); } @@ -128,6 +128,17 @@ static inline void anon_vma_unlock(struct anon_vma *anon_vma) up_write(&anon_vma->root->rwsem); } +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + + /* * anon_vma helper functions. 
*/ @@ -220,8 +231,8 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. */ -struct anon_vma *page_lock_anon_vma(struct page *page); -void page_unlock_anon_vma(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma_read(struct page *page); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acd37fe55eb7..a24c9cb9c83e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page) int ret = 1; BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) goto out; ret = 0; @@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); out: return ret; } @@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/mm/ksm.c b/mm/ksm.c index ae539f0b8aa1..7fa37de1ee0c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1634,7 +1634,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1688,7 +1688,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1741,7 +1741,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ddb68a169e45..f2cd830f66c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* diff --git a/mm/migrate.c b/mm/migrate.c index f24e9cc49cc4..6e46485f014c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -754,7 +754,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (PageAnon(page)) { /* - * Only page_lock_anon_vma() understands the subtleties of + * Only page_lock_anon_vma_read() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. 
*/ anon_vma = page_get_anon_vma(page); diff --git a/mm/mmap.c b/mm/mmap.c index 88408632da66..68a16b40c209 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -602,7 +602,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) anon_vma_interval_tree_pre_update_vma(next); diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307a..3dabd170753a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } if (vma->anon_vma) { anon_vma = vma->anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); } } diff --git a/mm/rmap.c b/mm/rmap.c index 6e3ee3b82798..b0f612df7b9d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma() such that + * Synchronize against page_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * mutex_trylock() from page_lock_anon_vma(). This orders: + * down_read_trylock() from page_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma() VS put_anon_vma() - * mutex_trylock() atomic_dec_and_test() + * page_lock_anon_vma_read() VS put_anon_vma() + * down_read_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() mutex_is_locked() + * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ if (rwsem_is_locked(&anon_vma->root->rwsem)) { - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_unlock(anon_vma); } @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). @@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma_unlock(anon_vma); @@ -442,7 +442,7 @@ out: * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. 
*/ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (down_write_trylock(&root_anon_vma->rwsem)) { + if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - up_write(&root_anon_vma->rwsem); + up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock() recursion. + * we'll deadlock on the anon_vma_lock_write() recursion. */ - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -504,9 +504,9 @@ out: return anon_vma; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma_read(struct anon_vma *anon_vma) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } /* @@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page, struct anon_vma_chain *avc; int referenced = 0; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return referenced; @@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page, break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return referenced; } @@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return ret; @@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return ret; } @@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, int ret = SWAP_AGAIN; /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing @@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); return ret; } -- cgit
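The net effect of the last patch is that rmap walkers now take the anon_vma lock shared and no longer serialise against one another, while vma-list modifications still take it exclusive. A rough user-space model of that end state, again assuming POSIX rwlocks as a stand-in for the kernel rwsem and using invented names:

/*
 * Walkers (the rmap_walk_anon()/try_to_unmap_anon() analogues) take the
 * lock shared and can run concurrently; list modification still takes
 * it exclusive. Illustrative only, not the kernel's anon_vma code.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t anon_vma_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static int vma_list_entries = 3;        /* stand-in for the anon_vma interval tree */

static void *walker(void *arg)
{
        /* Formerly an exclusive lock; now many walkers proceed at once. */
        pthread_rwlock_rdlock(&anon_vma_rwsem);
        printf("walker %ld sees %d vmas\n", (long)arg, vma_list_entries);
        pthread_rwlock_unlock(&anon_vma_rwsem);
        return NULL;
}

static void *modifier(void *arg)
{
        (void)arg;
        /* A fork()/unmap analogue: list changes still need exclusive access. */
        pthread_rwlock_wrlock(&anon_vma_rwsem);
        vma_list_entries++;
        pthread_rwlock_unlock(&anon_vma_rwsem);
        return NULL;
}

int main(void)
{
        pthread_t t[4];
        long i;

        for (i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, walker, (void *)i);
        pthread_create(&t[3], NULL, modifier, NULL);
        for (i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}

Built with -pthread, the three walkers can hold the read lock at the same time, which is the contention shown disappearing in the profile above.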