diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 108 |
1 files changed, 72 insertions, 36 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d36b2af4d1bf..f3c4f9d22821 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -240,18 +240,18 @@ static ssize_t defrag_store(struct kobject *kobj, clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); } else if (!memcmp("defer+madvise", buf, min(sizeof("defer+madvise")-1, count))) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); } else if (!memcmp("madvise", buf, min(sizeof("madvise")-1, count))) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); @@ -1568,8 +1568,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, deactivate_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); @@ -1724,37 +1723,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - int ret = 0; + pmd_t entry; + bool preserve_write; + int ret; ptl = __pmd_trans_huge_lock(pmd, vma); - if (ptl) { - pmd_t entry; - bool preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; + if (!ptl) + return 0; - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; - if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && - pmd_write(entry)); - } - spin_unlock(ptl); - } + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + + if (prot_numa && pmd_protnone(*pmd)) + goto unlock; + + /* + * In case prot_numa, we are under down_read(mmap_sem). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under down_read(mmap_sem): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and don't clear it + * which may break userspace. + * + * pmdp_invalidate() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + entry = *pmd; + pmdp_invalidate(vma, addr, pmd); + + /* + * Recover dirty/young flags. It relies on pmdp_invalidate to not + * corrupt them. + */ + if (pmd_dirty(*pmd)) + entry = pmd_mkdirty(entry); + if (pmd_young(*pmd)) + entry = pmd_mkyoung(entry); + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); return ret; } @@ -1828,7 +1859,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); - count_vm_event(THP_SPLIT_PMD); + count_vm_event(THP_SPLIT_PUD); pudp_huge_clear_flush_notify(vma, haddr, pud); } @@ -2048,6 +2079,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -2055,7 +2087,11 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, if (!pgd_present(*pgd)) return; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return; + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return; |