diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/hmm.c | 470 | ||||
| -rw-r--r-- | mm/huge_memory.c | 3 | ||||
| -rw-r--r-- | mm/hugetlb_cgroup.c | 3 | ||||
| -rw-r--r-- | mm/madvise.c | 12 | ||||
| -rw-r--r-- | mm/memcontrol.c | 159 | ||||
| -rw-r--r-- | mm/memory.c | 35 | ||||
| -rw-r--r-- | mm/memory_hotplug.c | 8 | ||||
| -rw-r--r-- | mm/memremap.c | 4 | ||||
| -rw-r--r-- | mm/migrate.c | 9 | ||||
| -rw-r--r-- | mm/mmap.c | 4 | ||||
| -rw-r--r-- | mm/mmu_notifier.c | 27 | ||||
| -rw-r--r-- | mm/mprotect.c | 38 | ||||
| -rw-r--r-- | mm/mremap.c | 11 | ||||
| -rw-r--r-- | mm/nommu.c | 10 | ||||
| -rw-r--r-- | mm/shmem.c | 2 | ||||
| -rw-r--r-- | mm/slub.c | 41 | ||||
| -rw-r--r-- | mm/sparse.c | 16 | ||||
| -rw-r--r-- | mm/swapfile.c | 41 | ||||
| -rw-r--r-- | mm/vmalloc.c | 11 | ||||
| -rw-r--r-- | mm/vmscan.c | 9 | ||||
| -rw-r--r-- | mm/z3fold.c | 1 |
21 files changed, 496 insertions, 418 deletions
@@ -28,41 +28,25 @@ struct hmm_vma_walk { struct hmm_range *range; - struct dev_pagemap *pgmap; unsigned long last; - unsigned int flags; }; -static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, - bool write_fault, uint64_t *pfn) -{ - unsigned int flags = FAULT_FLAG_REMOTE; - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - vm_fault_t ret; - - if (!vma) - goto err; - - if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) - flags |= FAULT_FLAG_ALLOW_RETRY; - if (write_fault) - flags |= FAULT_FLAG_WRITE; - - ret = handle_mm_fault(vma, addr, flags); - if (ret & VM_FAULT_RETRY) { - /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ - return -EAGAIN; - } - if (ret & VM_FAULT_ERROR) - goto err; - - return -EBUSY; +enum { + HMM_NEED_FAULT = 1 << 0, + HMM_NEED_WRITE_FAULT = 1 << 1, + HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, +}; -err: - *pfn = range->values[HMM_PFN_ERROR]; - return -EFAULT; +/* + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn + * @range: range use to encode HMM pfn value + * @pfn: pfn value for which to create the device entry + * Return: valid device entry for the pfn + */ +static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, + unsigned long pfn) +{ + return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } static int hmm_pfns_fill(unsigned long addr, unsigned long end, @@ -79,56 +63,43 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end, } /* - * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) + * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s) * @addr: range virtual start address (inclusive) * @end: range virtual end address (exclusive) - * @fault: should we fault or not ? - * @write_fault: write fault ? + * @required_fault: HMM_NEED_* flags * @walk: mm_walk structure - * Return: 0 on success, -EBUSY after page fault, or page fault error + * Return: -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. */ -static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, - bool fault, bool write_fault, - struct mm_walk *walk) +static int hmm_vma_fault(unsigned long addr, unsigned long end, + unsigned int required_fault, struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long i; + struct vm_area_struct *vma = walk->vma; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + WARN_ON_ONCE(!required_fault); hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - - if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) - return -EPERM; - - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = range->values[HMM_PFN_NONE]; - if (fault || write_fault) { - int ret; - ret = hmm_vma_do_fault(walk, addr, write_fault, - &pfns[i]); - if (ret != -EBUSY) - return ret; - } + if (required_fault & HMM_NEED_WRITE_FAULT) { + if (!(vma->vm_flags & VM_WRITE)) + return -EPERM; + fault_flags |= FAULT_FLAG_WRITE; } - return (fault || write_fault) ? -EBUSY : 0; + for (; addr < end; addr += PAGE_SIZE) + if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR) + return -EFAULT; + return -EBUSY; } -static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - uint64_t pfns, uint64_t cpu_flags, - bool *fault, bool *write_fault) +static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; - if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) - return; - /* * So we not only consider the individual per page request we also * consider the default flags requested for the range. The API can @@ -143,46 +114,44 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) - return; - /* If this is device memory then only fault if explicitly requested */ - if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { - /* Do we fault on device memory ? */ - if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { - *write_fault = pfns & range->flags[HMM_PFN_WRITE]; - *fault = true; - } - return; - } + return 0; - /* If CPU page table is not valid then we need to fault */ - *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); /* Need to write fault ? */ if ((pfns & range->flags[HMM_PFN_WRITE]) && - !(cpu_flags & range->flags[HMM_PFN_WRITE])) { - *write_fault = true; - *fault = true; - } + !(cpu_flags & range->flags[HMM_PFN_WRITE])) + return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; + + /* If CPU page table is not valid then we need to fault */ + if (!(cpu_flags & range->flags[HMM_PFN_VALID])) + return HMM_NEED_FAULT; + return 0; } -static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - const uint64_t *pfns, unsigned long npages, - uint64_t cpu_flags, bool *fault, - bool *write_fault) +static unsigned int +hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags) { + struct hmm_range *range = hmm_vma_walk->range; + unsigned int required_fault = 0; unsigned long i; - if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { - *fault = *write_fault = false; - return; - } + /* + * If the default flags do not request to fault pages, and the mask does + * not allow for individual pages to be faulted, then + * hmm_pte_need_fault() will always return 0. + */ + if (!((range->default_flags | range->pfn_flags_mask) & + range->flags[HMM_PFN_VALID])) + return 0; - *fault = *write_fault = false; for (i = 0; i < npages; ++i) { - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, - fault, write_fault); - if ((*write_fault)) - return; + required_fault |= + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags); + if (required_fault == HMM_NEED_ALL_BITS) + return required_fault; } + return required_fault; } static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, @@ -190,16 +159,23 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - bool fault, write_fault; + unsigned int required_fault; unsigned long i, npages; uint64_t *pfns; i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; pfns = &range->pfns[i]; - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - 0, &fault, &write_fault); - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0); + if (!walk->vma) { + if (required_fault) + return -EFAULT; + return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR); + } + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); + hmm_vma_walk->last = addr; + return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); } static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) @@ -218,31 +194,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; - bool fault, write_fault; + unsigned int required_fault; uint64_t cpu_flags; npages = (end - addr) >> PAGE_SHIFT; cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, - &fault, &write_fault); - - if (pmd_protnone(pmd) || fault || write_fault) - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + required_fault = + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags); + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { - if (pmd_devmap(pmd)) { - hmm_vma_walk->pgmap = get_dev_pagemap(pfn, - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; - } + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - } - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } hmm_vma_walk->last = end; return 0; } @@ -252,6 +216,14 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, uint64_t *pfns, pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool hmm_is_device_private_entry(struct hmm_range *range, + swp_entry_t entry) +{ + return is_device_private_entry(entry) && + device_private_entry_to_page(entry)->pgmap->owner == + range->dev_private_owner; +} + static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) @@ -267,102 +239,81 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - bool fault, write_fault; + unsigned int required_fault; uint64_t cpu_flags; pte_t pte = *ptep; uint64_t orig_pfn = *pfn; - *pfn = range->values[HMM_PFN_NONE]; - fault = write_fault = false; - if (pte_none(pte)) { - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, - &fault, &write_fault); - if (fault || write_fault) + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + if (required_fault) goto fault; + *pfn = range->values[HMM_PFN_NONE]; return 0; } if (!pte_present(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (!non_swap_entry(entry)) { - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) - goto fault; - return 0; - } - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. + * Never fault in device private pages pages, but just report + * the PFN even if not present. */ - if (is_device_private_entry(entry)) { - cpu_flags = range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_DEVICE_PRIVATE]; - cpu_flags |= is_write_device_private_entry(entry) ? - range->flags[HMM_PFN_WRITE] : 0; - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) - goto fault; + if (hmm_is_device_private_entry(range, entry)) { *pfn = hmm_device_entry_from_pfn(range, - swp_offset(entry)); - *pfn |= cpu_flags; + device_private_entry_to_pfn(entry)); + *pfn |= range->flags[HMM_PFN_VALID]; + if (is_write_device_private_entry(entry)) + *pfn |= range->flags[HMM_PFN_WRITE]; return 0; } - if (is_migration_entry(entry)) { - if (fault || write_fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(walk->mm, pmdp, addr); - return -EBUSY; - } + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + if (!required_fault) { + *pfn = range->values[HMM_PFN_NONE]; return 0; } + if (!non_swap_entry(entry)) + goto fault; + + if (is_migration_entry(entry)) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(walk->mm, pmdp, addr); + return -EBUSY; + } + /* Report error for everything else */ - *pfn = range->values[HMM_PFN_ERROR]; + pte_unmap(ptep); return -EFAULT; - } else { - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); } - if (fault || write_fault) + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + if (required_fault) goto fault; - if (pte_devmap(pte)) { - hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; - } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { - if (!is_zero_pfn(pte_pfn(pte))) { - *pfn = range->values[HMM_PFN_SPECIAL]; + /* + * Since each architecture defines a struct page for the zero page, just + * fall through and treat it like a normal page. + */ + if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { + if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) { + pte_unmap(ptep); return -EFAULT; } - /* - * Since each architecture defines a struct page for the zero - * page, just fall through and treat it like a normal page. - */ + *pfn = range->values[HMM_PFN_SPECIAL]; + return 0; } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + return hmm_vma_fault(addr, end, required_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -372,8 +323,9 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long addr = start, i; + uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long addr = start; pte_t *ptep; pmd_t pmd; @@ -383,24 +335,19 @@ again: return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { - bool fault, write_fault; - unsigned long npages; - uint64_t *pfns; - - i = (addr - range->start) >> PAGE_SHIFT; - npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; - - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - 0, &fault, &write_fault); - if (fault || write_fault) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } - return 0; - } else if (!pmd_present(pmd)) + return hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + } + + if (!pmd_present(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -417,8 +364,7 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - i = (addr - range->start) >> PAGE_SHIFT; - return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); + return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd); } /* @@ -427,31 +373,23 @@ again: * entry pointing to pte directory or it is a bad pmd that will not * recover. */ - if (pmd_bad(pmd)) + if (pmd_bad(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } ptep = pte_offset_map(pmdp, addr); - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) { int r; - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); if (r) { - /* hmm_vma_handle_pte() did unmap pte directory */ + /* hmm_vma_handle_pte() did pte_unmap() */ hmm_vma_walk->last = addr; return r; } } - if (hmm_vma_walk->pgmap) { - /* - * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() - * so that we can leverage get_dev_pagemap() optimization which - * will not re-take a reference on a pgmap if we already have - * one. - */ - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; @@ -487,18 +425,18 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pud = READ_ONCE(*pudp); if (pud_none(pud)) { - ret = hmm_vma_walk_hole(start, end, -1, walk); - goto out_unlock; + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); } if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; + unsigned int required_fault; uint64_t *pfns, cpu_flags; - bool fault, write_fault; if (!pud_present(pud)) { - ret = hmm_vma_walk_hole(start, end, -1, walk); - goto out_unlock; + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); } i = (addr - range->start) >> PAGE_SHIFT; @@ -506,29 +444,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pfns = &range->pfns[i]; cpu_flags = pud_to_hmm_pfn_flags(range, pud); - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - cpu_flags, &fault, &write_fault); - if (fault || write_fault) { - ret = hmm_vma_walk_hole_(addr, end, fault, - write_fault, walk); - goto out_unlock; + required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, + npages, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) { - hmm_vma_walk->pgmap = get_dev_pagemap(pfn, - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) { - ret = -EBUSY; - goto out_unlock; - } + for (i = 0; i < npages; ++i, ++pfn) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - } - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } hmm_vma_walk->last = end; goto out_unlock; } @@ -554,24 +480,20 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; uint64_t orig_pfn, cpu_flags; - bool fault, write_fault; + unsigned int required_fault; spinlock_t *ptl; pte_t entry; - int ret = 0; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); i = (start - range->start) >> PAGE_SHIFT; orig_pfn = range->pfns[i]; - range->pfns[i] = range->values[HMM_PFN_NONE]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); - fault = write_fault = false; - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) { - ret = -ENOENT; - goto unlock; + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); @@ -579,14 +501,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; hmm_vma_walk->last = end; - -unlock: spin_unlock(ptl); - - if (ret == -ENOENT) - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); - - return ret; + return 0; } #else #define hmm_vma_walk_hugetlb_entry NULL @@ -599,40 +515,32 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - /* - * Skip vma ranges that don't have struct page backing them or - * map I/O devices directly. - */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) - return -EFAULT; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + vma->vm_flags & VM_READ) + return 0; /* + * vma ranges that don't have struct page backing them or map I/O + * devices directly cannot be handled by hmm_range_fault(). + * * If the vma does not allow read access, then assume that it does not - * allow write access either. HMM does not support architectures - * that allow write without read. + * allow write access either. HMM does not support architectures that + * allow write without read. + * + * If a fault is requested for an unsupported range then it is a hard + * failure. */ - if (!(vma->vm_flags & VM_READ)) { - bool fault, write_fault; - - /* - * Check to see if a fault is requested for any page in the - * range. - */ - hmm_range_need_fault(hmm_vma_walk, range->pfns + - ((start - range->start) >> PAGE_SHIFT), - (end - start) >> PAGE_SHIFT, - 0, &fault, &write_fault); - if (fault || write_fault) - return -EFAULT; - - hmm_pfns_fill(start, end, range, HMM_PFN_NONE); - hmm_vma_walk->last = end; + if (hmm_range_need_fault(hmm_vma_walk, + range->pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, 0)) + return -EFAULT; - /* Skip this vma and continue processing the next vma. */ - return 1; - } + hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + hmm_vma_walk->last = end; - return 0; + /* Skip this vma and continue processing the next vma. */ + return 1; } static const struct mm_walk_ops hmm_walk_ops = { @@ -645,8 +553,7 @@ static const struct mm_walk_ops hmm_walk_ops = { /** * hmm_range_fault - try to fault some address in a virtual address range - * @range: range being faulted - * @flags: HMM_FAULT_* flags + * @range: argument structure * * Return: the number of valid pages in range->pfns[] (from range start * address), which may be zero. On error one of the following status codes @@ -657,26 +564,19 @@ static const struct mm_walk_ops hmm_walk_ops = { * -ENOMEM: Out of memory. * -EPERM: Invalid permission (e.g., asking for write and range is read * only). - * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. * -EBUSY: The range has been invalidated and the caller needs to wait for * the invalidation to finish. - * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access - * that range) number of valid pages in range->pfns[] (from - * range start address). - * - * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs - * and caller does not ask for migration. + * -EFAULT: A page was requested to be valid and could not be made valid + * ie it has no backing VMA or it is illegal to access * - * On error, for one virtual address in the range, the function will mark the - * corresponding HMM pfn entry with an error flag. + * This is similar to get_user_pages(), except that it can read the page tables + * without mutating them (ie causing faults). */ -long hmm_range_fault(struct hmm_range *range, unsigned int flags) +long hmm_range_fault(struct hmm_range *range) { struct hmm_vma_walk hmm_vma_walk = { .range = range, .last = range->start, - .flags = flags, }; struct mm_struct *mm = range->notifier->mm; int ret; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b08b199f9a11..24ad53b4dfc0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3043,8 +3043,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, return; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); - pmdval = *pvmw->pmd; - pmdp_invalidate(vma, address, pvmw->pmd); + pmdval = pmdp_invalidate(vma, address, pvmw->pmd); if (pmd_dirty(pmdval)) set_page_dirty(page); entry = make_migration_entry(page, pmd_write(pmdval)); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index e434b05416c6..5280bcf459af 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -240,8 +240,7 @@ again: if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) { ret = -ENOMEM; - hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx, - HUGETLB_MAX); + hugetlb_event(h_cg, idx, HUGETLB_MAX); } css_put(&h_cg->css); done: diff --git a/mm/madvise.c b/mm/madvise.c index 43b47d3fae02..4bb30ed6c8d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -335,12 +335,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, } page = pmd_page(orig_pmd); + + /* Do not interfere with other mappings of this page */ + if (page_mapcount(page) != 1) + goto huge_unlock; + if (next - addr != HPAGE_PMD_SIZE) { int err; - if (page_mapcount(page) != 1) - goto huge_unlock; - get_page(page); spin_unlock(ptl); lock_page(page); @@ -426,6 +428,10 @@ regular_page: continue; } + /* Do not interfere with other mappings of this page */ + if (page_mapcount(page) != 1) + continue; + VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6dc8712e39..7ddf91c4295f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -409,8 +409,10 @@ int memcg_expand_shrinker_maps(int new_id) if (mem_cgroup_is_root(memcg)) continue; ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) + if (ret) { + mem_cgroup_iter_break(NULL, memcg); goto unlock; + } } unlock: if (!ret) @@ -775,6 +777,17 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } +void mod_memcg_obj_state(void *p, int idx, int val) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_obj(p); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -2295,28 +2308,41 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_SCALING_SHIFT 14 /* - * Scheduled by try_charge() to be executed from the userland return path - * and reclaims memory over the high limit. + * Get the number of jiffies that we should penalise a mischievous cgroup which + * is exceeding its memory.high by checking both it and its ancestors. */ -void mem_cgroup_handle_over_high(void) +static unsigned long calculate_high_delay(struct mem_cgroup *memcg, + unsigned int nr_pages) { - unsigned long usage, high, clamped_high; - unsigned long pflags; - unsigned long penalty_jiffies, overage; - unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg; + unsigned long penalty_jiffies; + u64 max_overage = 0; - if (likely(!nr_pages)) - return; + do { + unsigned long usage, high; + u64 overage; - memcg = get_mem_cgroup_from_mm(current->mm); - reclaim_high(memcg, nr_pages, GFP_KERNEL); - current->memcg_nr_pages_over_high = 0; + usage = page_counter_read(&memcg->memory); + high = READ_ONCE(memcg->high); + + /* + * Prevent division by 0 in overage calculation by acting as if + * it was a threshold of 1 page + */ + high = max(high, 1UL); + + overage = usage - high; + overage <<= MEMCG_DELAY_PRECISION_SHIFT; + overage = div64_u64(overage, high); + + if (overage > max_overage) + max_overage = overage; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + if (!max_overage) + return 0; /* - * memory.high is breached and reclaim is unable to keep up. Throttle - * allocators proactively to slow down excessive growth. - * * We use overage compared to memory.high to calculate the number of * jiffies to sleep (penalty_jiffies). Ideally this value should be * fairly lenient on small overages, and increasingly harsh when the @@ -2324,24 +2350,9 @@ void mem_cgroup_handle_over_high(void) * its crazy behaviour, so we exponentially increase the delay based on * overage amount. */ - - usage = page_counter_read(&memcg->memory); - high = READ_ONCE(memcg->high); - - if (usage <= high) - goto out; - - /* - * Prevent division by 0 in overage calculation by acting as if it was a - * threshold of 1 page - */ - clamped_high = max(high, 1UL); - - overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, - clamped_high); - - penalty_jiffies = ((u64)overage * overage * HZ) - >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); + penalty_jiffies = max_overage * max_overage * HZ; + penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; + penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; /* * Factor in the task's own contribution to the overage, such that four @@ -2358,7 +2369,32 @@ void mem_cgroup_handle_over_high(void) * application moving forwards and also permit diagnostics, albeit * extremely slowly. */ - penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); +} + +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. + */ +void mem_cgroup_handle_over_high(void) +{ + unsigned long penalty_jiffies; + unsigned long pflags; + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg; + + if (likely(!nr_pages)) + return; + + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); + current->memcg_nr_pages_over_high = 0; + + /* + * memory.high is breached and reclaim is unable to keep up. Throttle + * allocators proactively to slow down excessive growth. + */ + penalty_jiffies = calculate_high_delay(memcg, nr_pages); /* * Don't sleep if the amount of jiffies this memcg owes us is so low @@ -2636,6 +2672,33 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM +/* + * Returns a pointer to the memory cgroup to which the kernel object is charged. + * + * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), + * cgroup_mutex, etc. + */ +struct mem_cgroup *mem_cgroup_from_obj(void *p) +{ + struct page *page; + + if (mem_cgroup_disabled()) + return NULL; + + page = virt_to_head_page(p); + + /* + * Slab pages don't have page->mem_cgroup set because corresponding + * kmem caches can be reparented during the lifetime. That's why + * memcg_from_slab_page() should be used instead. + */ + if (PageSlab(page)) + return memcg_from_slab_page(page); + + /* All other pages use page->mem_cgroup */ + return page->mem_cgroup; +} + static int memcg_alloc_cache_id(void) { int id, size; @@ -4025,7 +4088,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; unsigned long usage; - int i, j, size; + int i, j, size, entries; mutex_lock(&memcg->thresholds_lock); @@ -4045,14 +4108,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, __mem_cgroup_threshold(memcg, type == _MEMSWAP); /* Calculate new number of threshold */ - size = 0; + size = entries = 0; for (i = 0; i < thresholds->primary->size; i++) { if (thresholds->primary->entries[i].eventfd != eventfd) size++; + else + entries++; } new = thresholds->spare; + /* If no items related to eventfd have been cleared, nothing to do */ + if (!entries) + goto unlock; + /* Set thresholds array to NULL if we don't have thresholds */ if (!size) { kfree(new); @@ -6680,19 +6749,9 @@ void mem_cgroup_sk_alloc(struct sock *sk) if (!mem_cgroup_sockets_enabled) return; - /* - * Socket cloning can throw us here with sk_memcg already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_memcg) { - css_get(&sk->sk_memcg->css); + /* Do not associate the sock with unrelated interrupted task's memcg. */ + if (in_interrupt()) return; - } rcu_read_lock(); memcg = mem_cgroup_from_task(current); diff --git a/mm/memory.c b/mm/memory.c index 0bccc622e482..e8bfdf0d9d1d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2257,7 +2257,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src, bool ret; void *kaddr; void __user *uaddr; - bool force_mkyoung; + bool locked = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; @@ -2282,11 +2282,11 @@ static inline bool cow_user_page(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ - force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte); - if (force_mkyoung) { + if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* * Other thread has already handled the fault @@ -2310,18 +2310,37 @@ static inline bool cow_user_page(struct page *dst, struct page *src, * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { + if (locked) + goto warn; + + /* Re-validate under PTL if the page is still mapped */ + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + locked = true; + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + /* The PTE changed under us. Retry page fault. */ + ret = false; + goto pte_unlock; + } + /* - * Give a warn in case there can be some obscure - * use-case + * The same page can be mapped back since last copy attampt. + * Try to copy again under PTL. */ - WARN_ON_ONCE(1); - clear_page(kaddr); + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { + /* + * Give a warn in case there can be some obscure + * use-case + */ +warn: + WARN_ON_ONCE(1); + clear_page(kaddr); + } } ret = true; pte_unlock: - if (force_mkyoung) + if (locked) pte_unmap_unlock(vmf->pte, vmf->ptl); kunmap_atomic(kaddr); flush_dcache_page(dst); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0a54ffac8c68..19389cdc16a5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -574,7 +574,13 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback); void generic_online_page(struct page *page, unsigned int order) { - kernel_map_pages(page, 1 << order, 1); + /* + * Freeing the page with debug_pagealloc enabled will try to unmap it, + * so we should map it first. This is better than introducing a special + * case in page freeing fast path. + */ + if (debug_pagealloc_enabled_static()) + kernel_map_pages(page, 1 << order, 1); __free_pages_core(page, order); totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM diff --git a/mm/memremap.c b/mm/memremap.c index 09b5b7adc773..9b2c97ceb775 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -181,6 +181,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } break; case MEMORY_DEVICE_FS_DAX: if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || diff --git a/mm/migrate.c b/mm/migrate.c index b1092876e537..7605d2c23433 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2241,7 +2241,7 @@ again: arch_enter_lazy_mmu_mode(); for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn, pfn; + unsigned long mpfn = 0, pfn; struct page *page; swp_entry_t entry; pte_t pte; @@ -2255,8 +2255,6 @@ again: } if (!pte_present(pte)) { - mpfn = 0; - /* * Only care about unaddressable device page special * page table entry. Other special swap entries are not @@ -2267,11 +2265,16 @@ again: goto next; page = device_private_entry_to_page(entry); + if (page->pgmap->owner != migrate->src_owner) + goto next; + mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + if (migrate->src_owner) + goto next; pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; diff --git a/mm/mmap.c b/mm/mmap.c index 6756b8bb0033..d681a20eb4ea 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -195,8 +195,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); - brk = untagged_addr(brk); - if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -1557,8 +1555,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; - addr = untagged_addr(addr); - if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index ef3973a5d34a..06852b896fa6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -307,7 +307,8 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all @@ -370,7 +371,8 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_flush_young) young |= subscription->ops->clear_flush_young( subscription, mm, start, end); @@ -389,7 +391,8 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->clear_young) young |= subscription->ops->clear_young(subscription, mm, start, end); @@ -407,7 +410,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->test_young) { young = subscription->ops->test_young(subscription, mm, address); @@ -428,7 +432,8 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->change_pte) subscription->ops->change_pte(subscription, mm, address, pte); @@ -476,7 +481,8 @@ static int mn_hlist_invalidate_range_start( int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { const struct mmu_notifier_ops *ops = subscription->ops; if (ops->invalidate_range_start) { @@ -528,7 +534,8 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -582,7 +589,8 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { if (subscription->ops->invalidate_range) subscription->ops->invalidate_range(subscription, mm, start, end); @@ -714,7 +722,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) spin_lock(&mm->notifier_subscriptions->lock); hlist_for_each_entry_rcu(subscription, - &mm->notifier_subscriptions->list, hlist) { + &mm->notifier_subscriptions->list, hlist, + lockdep_is_held(&mm->notifier_subscriptions->lock)) { if (subscription->ops != ops) continue; diff --git a/mm/mprotect.c b/mm/mprotect.c index 7a8e84f86831..311c0dadf71c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -161,6 +161,31 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return pages; } +/* + * Used when setting automatic NUMA hinting protection where it is + * critical that a numa hinting PMD is not confused with a bad PMD. + */ +static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) +{ + pmd_t pmdval = pmd_read_atomic(pmd); + + /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + barrier(); +#endif + + if (pmd_none(pmdval)) + return 1; + if (pmd_trans_huge(pmdval)) + return 0; + if (unlikely(pmd_bad(pmdval))) { + pmd_clear_bad(pmd); + return 1; + } + + return 0; +} + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -178,8 +203,17 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) - && pmd_none_or_clear_bad(pmd)) + + /* + * Automatic NUMA balancing walks the tables with mmap_sem + * held for read. It's possible a parallel update to occur + * between pmd_trans_huge() and a pmd_none_or_clear_bad() + * check leading to a false positive and clearing. + * Hence, it's necessary to atomically read the PMD value + * for all the checks. + */ + if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) && + pmd_none_or_clear_bad_unless_trans_huge(pmd)) goto next; /* invoke the mmu notifier if the pmd is populated */ diff --git a/mm/mremap.c b/mm/mremap.c index 122938dcec15..d28f08a36b96 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -606,8 +606,17 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); + /* + * There is a deliberate asymmetry here: we strip the pointer tag + * from the old address but leave the new address alone. This is + * for consistency with mmap(), where we prevent the creation of + * aliasing mappings in userspace by leaving the tag bits of the + * mapping address intact. A non-zero tag will cause the subsequent + * range checks to reject the address as invalid. + * + * See Documentation/arm64/tagged-address-abi.rst for more information. + */ addr = untagged_addr(addr); - new_addr = untagged_addr(new_addr); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; diff --git a/mm/nommu.c b/mm/nommu.c index bd2b4e5ef144..318df4e236c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -370,10 +370,14 @@ void vm_unmap_aliases(void) EXPORT_SYMBOL_GPL(vm_unmap_aliases); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement a stub for vmalloc_sync_[un]mapping() if the architecture + * chose not to have one. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) +{ +} + +void __weak vmalloc_sync_unmappings(void) { } diff --git a/mm/shmem.c b/mm/shmem.c index c8f7540ef048..aad3ba74b0e9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3386,8 +3386,6 @@ static const struct constant_table shmem_param_enums_huge[] = { {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, - {"deny", SHMEM_HUGE_DENY }, - {"force", SHMEM_HUGE_FORCE }, {} }; diff --git a/mm/slub.c b/mm/slub.c index 17dc00e33115..6589b41d5a60 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1973,8 +1973,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - else if (!node_present_pages(node)) - searchnode = node_to_mem_node(node); object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -2563,17 +2561,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct page *page; page = c->page; - if (!page) + if (!page) { + /* + * if the node is not online or has no normal memory, just + * ignore the node constraint + */ + if (unlikely(node != NUMA_NO_NODE && + !node_state(node, N_NORMAL_MEMORY))) + node = NUMA_NO_NODE; goto new_slab; + } redo: if (unlikely(!node_match(page, node))) { - int searchnode = node; - - if (node != NUMA_NO_NODE && !node_present_pages(node)) - searchnode = node_to_mem_node(node); - - if (unlikely(!node_match(page, searchnode))) { + /* + * same as above but node_match() being false already + * implies node != NUMA_NO_NODE + */ + if (!node_state(node, N_NORMAL_MEMORY)) { + node = NUMA_NO_NODE; + goto redo; + } else { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, page, c->freelist, c); goto new_slab; @@ -2997,11 +3005,13 @@ redo: barrier(); if (likely(page == c->page)) { - set_freepointer(s, tail_obj, c->freelist); + void **freelist = READ_ONCE(c->freelist); + + set_freepointer(s, tail_obj, freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, - c->freelist, tid, + freelist, tid, head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); @@ -3175,6 +3185,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!object)) { /* + * We may have removed an object from c->freelist using + * the fastpath in the previous iteration; in that case, + * c->tid has not been bumped yet. + * Since ___slab_alloc() may reenable interrupts while + * allocating memory, we should bump c->tid now. + */ + c->tid = next_tid(c->tid); + + /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist */ diff --git a/mm/sparse.c b/mm/sparse.c index c184b69460b7..65599e8bd636 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -734,6 +734,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); struct page *memmap = NULL; + bool empty; unsigned long *subsection_map = ms->usage ? &ms->usage->subsection_map[0] : NULL; @@ -764,7 +765,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified */ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { + empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION); + if (empty) { unsigned long section_nr = pfn_to_section_nr(pfn); /* @@ -779,13 +781,21 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, ms->usage = NULL; } memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - ms->section_mem_map = (unsigned long)NULL; + /* + * Mark the section invalid so that valid_section() + * return false. This prevents code from dereferencing + * ms->usage array. + */ + ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; } if (section_is_early && memmap) free_map_bootmem(memmap); else depopulate_section_memmap(pfn, nr_pages, altmap); + + if (empty) + ms->section_mem_map = (unsigned long)NULL; } static struct page * __meminit section_activate(int nid, unsigned long pfn, @@ -876,7 +886,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2c33ff456ed5..be33e6176cd9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2899,10 +2899,6 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->bdev = inode->i_sb->s_bdev; } - inode_lock(inode); - if (IS_SWAPFILE(inode)) - return -EBUSY; - return 0; } @@ -3157,36 +3153,41 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; + inode_lock(inode); + if (IS_SWAPFILE(inode)) { + error = -EBUSY; + goto bad_swap_unlock_inode; + } + /* * Read the swap header. */ if (!mapping->a_ops->readpage) { error = -EINVAL; - goto bad_swap; + goto bad_swap_unlock_inode; } page = read_mapping_page(mapping, 0, swap_file); if (IS_ERR(page)) { error = PTR_ERR(page); - goto bad_swap; + goto bad_swap_unlock_inode; } swap_header = kmap(page); maxpages = read_swap_header(p, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; - goto bad_swap; + goto bad_swap_unlock_inode; } /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; - goto bad_swap; + goto bad_swap_unlock_inode; } if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) @@ -3211,7 +3212,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) GFP_KERNEL); if (!cluster_info) { error = -ENOMEM; - goto bad_swap; + goto bad_swap_unlock_inode; } for (ci = 0; ci < nr_cluster; ci++) @@ -3220,7 +3221,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!p->percpu_cluster) { error = -ENOMEM; - goto bad_swap; + goto bad_swap_unlock_inode; } for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; @@ -3234,13 +3235,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = swap_cgroup_swapon(p->type, maxpages); if (error) - goto bad_swap; + goto bad_swap_unlock_inode; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; - goto bad_swap; + goto bad_swap_unlock_inode; } /* frontswap enabled? set up bit-per-page map for frontswap */ if (IS_ENABLED(CONFIG_FRONTSWAP)) @@ -3280,7 +3281,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = init_swap_address_space(p->type, maxpages); if (error) - goto bad_swap; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3290,7 +3291,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto bad_swap; + goto bad_swap_unlock_inode; } mutex_lock(&swapon_mutex); @@ -3315,6 +3316,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +bad_swap_unlock_inode: + inode_unlock(inode); bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; @@ -3322,6 +3325,7 @@ bad_swap: set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } + inode = NULL; destroy_swap_extents(p); swap_cgroup_swapoff(p->type); spin_lock(&swap_lock); @@ -3333,13 +3337,8 @@ bad_swap: kvfree(frontswap_map); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); - if (swap_file) { - if (inode) { - inode_unlock(inode); - inode = NULL; - } + if (swap_file) filp_close(swap_file, NULL); - } out: if (page && !IS_ERR(page)) { kunmap(page); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1f46c3b86f9f..6b8eeb0ecee5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1295,7 +1295,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) * First make sure the mappings are removed from all page-tables * before they are freed. */ - vmalloc_sync_all(); + vmalloc_sync_unmappings(); /* * TODO: to calculate a flush range without looping. @@ -3128,16 +3128,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, EXPORT_SYMBOL(remap_vmalloc_range); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose + * not to have one. * * The purpose of this function is to make sure the vmalloc area * mappings are identical in all page-tables in the system. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) { } +void __weak vmalloc_sync_unmappings(void) +{ +} static int f(pte_t *pte, unsigned long addr, void *data) { diff --git a/mm/vmscan.c b/mm/vmscan.c index c05eb9efec07..876370565455 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2415,10 +2415,13 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. - * Make sure we don't miss the last page - * because of a round-off error. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. */ - scan = DIV64_U64_ROUND_UP(scan * fraction[file], + scan = mem_cgroup_online(memcg) ? + div64_u64(scan * fraction[file], denominator) : + DIV64_U64_ROUND_UP(scan * fraction[file], denominator); break; case SCAN_FILE: diff --git a/mm/z3fold.c b/mm/z3fold.c index 43754d8ebce8..42f31c4b53ad 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,7 +41,6 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/rwlock.h> #include <linux/zpool.h> #include <linux/magic.h> |