Diffstat (limited to 'mm')
-rw-r--r--  mm/hmm.c            470
-rw-r--r--  mm/huge_memory.c      3
-rw-r--r--  mm/hugetlb_cgroup.c   3
-rw-r--r--  mm/madvise.c         12
-rw-r--r--  mm/memcontrol.c     159
-rw-r--r--  mm/memory.c          35
-rw-r--r--  mm/memory_hotplug.c   8
-rw-r--r--  mm/memremap.c         4
-rw-r--r--  mm/migrate.c          9
-rw-r--r--  mm/mmap.c             4
-rw-r--r--  mm/mmu_notifier.c    27
-rw-r--r--  mm/mprotect.c        38
-rw-r--r--  mm/mremap.c          11
-rw-r--r--  mm/nommu.c           10
-rw-r--r--  mm/shmem.c            2
-rw-r--r--  mm/slub.c            41
-rw-r--r--  mm/sparse.c          16
-rw-r--r--  mm/swapfile.c        41
-rw-r--r--  mm/vmalloc.c         11
-rw-r--r--  mm/vmscan.c           9
-rw-r--r--  mm/z3fold.c           1
21 files changed, 496 insertions, 418 deletions
diff --git a/mm/hmm.c b/mm/hmm.c
index 72e5a6d9a417..280585833adf 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -28,41 +28,25 @@
struct hmm_vma_walk {
struct hmm_range *range;
- struct dev_pagemap *pgmap;
unsigned long last;
- unsigned int flags;
};
-static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
- bool write_fault, uint64_t *pfn)
-{
- unsigned int flags = FAULT_FLAG_REMOTE;
- struct hmm_vma_walk *hmm_vma_walk = walk->private;
- struct hmm_range *range = hmm_vma_walk->range;
- struct vm_area_struct *vma = walk->vma;
- vm_fault_t ret;
-
- if (!vma)
- goto err;
-
- if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
- flags |= FAULT_FLAG_ALLOW_RETRY;
- if (write_fault)
- flags |= FAULT_FLAG_WRITE;
-
- ret = handle_mm_fault(vma, addr, flags);
- if (ret & VM_FAULT_RETRY) {
- /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
- return -EAGAIN;
- }
- if (ret & VM_FAULT_ERROR)
- goto err;
-
- return -EBUSY;
+enum {
+ HMM_NEED_FAULT = 1 << 0,
+ HMM_NEED_WRITE_FAULT = 1 << 1,
+ HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
+};
-err:
- *pfn = range->values[HMM_PFN_ERROR];
- return -EFAULT;
+/*
+ * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
+ * @range: range used to encode HMM pfn value
+ * @pfn: pfn value for which to create the device entry
+ * Return: valid device entry for the pfn
+ */
+static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
+ unsigned long pfn)
+{
+ return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
}
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
@@ -79,56 +63,43 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
}
/*
- * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
+ * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
* @addr: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
- * @fault: should we fault or not ?
- * @write_fault: write fault ?
+ * @required_fault: HMM_NEED_* flags
* @walk: mm_walk structure
- * Return: 0 on success, -EBUSY after page fault, or page fault error
+ * Return: -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
*/
-static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
- bool fault, bool write_fault,
- struct mm_walk *walk)
+static int hmm_vma_fault(unsigned long addr, unsigned long end,
+ unsigned int required_fault, struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
- struct hmm_range *range = hmm_vma_walk->range;
- uint64_t *pfns = range->pfns;
- unsigned long i;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned int fault_flags = FAULT_FLAG_REMOTE;
+ WARN_ON_ONCE(!required_fault);
hmm_vma_walk->last = addr;
- i = (addr - range->start) >> PAGE_SHIFT;
-
- if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- for (; addr < end; addr += PAGE_SIZE, i++) {
- pfns[i] = range->values[HMM_PFN_NONE];
- if (fault || write_fault) {
- int ret;
- ret = hmm_vma_do_fault(walk, addr, write_fault,
- &pfns[i]);
- if (ret != -EBUSY)
- return ret;
- }
+ if (required_fault & HMM_NEED_WRITE_FAULT) {
+ if (!(vma->vm_flags & VM_WRITE))
+ return -EPERM;
+ fault_flags |= FAULT_FLAG_WRITE;
}
- return (fault || write_fault) ? -EBUSY : 0;
+ for (; addr < end; addr += PAGE_SIZE)
+ if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+ return -EFAULT;
+ return -EBUSY;
}
-static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- uint64_t pfns, uint64_t cpu_flags,
- bool *fault, bool *write_fault)
+static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ uint64_t pfns, uint64_t cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;
- if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
- return;
-
/*
* So we not only consider the individual per-page request, we also
* consider the default flags requested for the range. The API can
@@ -143,46 +114,44 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
/* We aren't asked to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
- return;
- /* If this is device memory then only fault if explicitly requested */
- if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
- /* Do we fault on device memory ? */
- if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
- *write_fault = pfns & range->flags[HMM_PFN_WRITE];
- *fault = true;
- }
- return;
- }
+ return 0;
- /* If CPU page table is not valid then we need to fault */
- *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
/* Need to write fault ? */
if ((pfns & range->flags[HMM_PFN_WRITE]) &&
- !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
- *write_fault = true;
- *fault = true;
- }
+ !(cpu_flags & range->flags[HMM_PFN_WRITE]))
+ return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
+
+ /* If CPU page table is not valid then we need to fault */
+ if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
+ return HMM_NEED_FAULT;
+ return 0;
}
-static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- const uint64_t *pfns, unsigned long npages,
- uint64_t cpu_flags, bool *fault,
- bool *write_fault)
+static unsigned int
+hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+ const uint64_t *pfns, unsigned long npages,
+ uint64_t cpu_flags)
{
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned int required_fault = 0;
unsigned long i;
- if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
- *fault = *write_fault = false;
- return;
- }
+ /*
+ * If the default flags do not request to fault pages, and the mask does
+ * not allow for individual pages to be faulted, then
+ * hmm_pte_need_fault() will always return 0.
+ */
+ if (!((range->default_flags | range->pfn_flags_mask) &
+ range->flags[HMM_PFN_VALID]))
+ return 0;
- *fault = *write_fault = false;
for (i = 0; i < npages; ++i) {
- hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
- fault, write_fault);
- if ((*write_fault))
- return;
+ required_fault |=
+ hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
+ if (required_fault == HMM_NEED_ALL_BITS)
+ return required_fault;
}
+ return required_fault;
}
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
@@ -190,16 +159,23 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- bool fault, write_fault;
+ unsigned int required_fault;
unsigned long i, npages;
uint64_t *pfns;
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
pfns = &range->pfns[i];
- hmm_range_need_fault(hmm_vma_walk, pfns, npages,
- 0, &fault, &write_fault);
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+ required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
+ if (!walk->vma) {
+ if (required_fault)
+ return -EFAULT;
+ return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
+ }
+ if (required_fault)
+ return hmm_vma_fault(addr, end, required_fault, walk);
+ hmm_vma_walk->last = addr;
+ return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
}
static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
@@ -218,31 +194,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
- bool fault, write_fault;
+ unsigned int required_fault;
uint64_t cpu_flags;
npages = (end - addr) >> PAGE_SHIFT;
cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
- hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
- &fault, &write_fault);
-
- if (pmd_protnone(pmd) || fault || write_fault)
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+ required_fault =
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
+ if (required_fault)
+ return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
- if (pmd_devmap(pmd)) {
- hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
- hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
- return -EBUSY;
- }
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
- }
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
hmm_vma_walk->last = end;
return 0;
}
@@ -252,6 +216,14 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline bool hmm_is_device_private_entry(struct hmm_range *range,
+ swp_entry_t entry)
+{
+ return is_device_private_entry(entry) &&
+ device_private_entry_to_page(entry)->pgmap->owner ==
+ range->dev_private_owner;
+}
+
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
@@ -267,102 +239,81 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- bool fault, write_fault;
+ unsigned int required_fault;
uint64_t cpu_flags;
pte_t pte = *ptep;
uint64_t orig_pfn = *pfn;
- *pfn = range->values[HMM_PFN_NONE];
- fault = write_fault = false;
-
if (pte_none(pte)) {
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
- &fault, &write_fault);
- if (fault || write_fault)
+ required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ if (required_fault)
goto fault;
+ *pfn = range->values[HMM_PFN_NONE];
return 0;
}
if (!pte_present(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (!non_swap_entry(entry)) {
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
- if (fault || write_fault)
- goto fault;
- return 0;
- }
-
/*
- * This is a special swap entry, ignore migration, use
- * device and report anything else as error.
+ * Never fault in device private pages, but just report
+ * the PFN even if not present.
*/
- if (is_device_private_entry(entry)) {
- cpu_flags = range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_DEVICE_PRIVATE];
- cpu_flags |= is_write_device_private_entry(entry) ?
- range->flags[HMM_PFN_WRITE] : 0;
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
- if (fault || write_fault)
- goto fault;
+ if (hmm_is_device_private_entry(range, entry)) {
*pfn = hmm_device_entry_from_pfn(range,
- swp_offset(entry));
- *pfn |= cpu_flags;
+ device_private_entry_to_pfn(entry));
+ *pfn |= range->flags[HMM_PFN_VALID];
+ if (is_write_device_private_entry(entry))
+ *pfn |= range->flags[HMM_PFN_WRITE];
return 0;
}
- if (is_migration_entry(entry)) {
- if (fault || write_fault) {
- pte_unmap(ptep);
- hmm_vma_walk->last = addr;
- migration_entry_wait(walk->mm, pmdp, addr);
- return -EBUSY;
- }
+ required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ if (!required_fault) {
+ *pfn = range->values[HMM_PFN_NONE];
return 0;
}
+ if (!non_swap_entry(entry))
+ goto fault;
+
+ if (is_migration_entry(entry)) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(walk->mm, pmdp, addr);
+ return -EBUSY;
+ }
+
/* Report error for everything else */
- *pfn = range->values[HMM_PFN_ERROR];
+ pte_unmap(ptep);
return -EFAULT;
- } else {
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
}
- if (fault || write_fault)
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ if (required_fault)
goto fault;
- if (pte_devmap(pte)) {
- hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
- hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
- return -EBUSY;
- } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
- if (!is_zero_pfn(pte_pfn(pte))) {
- *pfn = range->values[HMM_PFN_SPECIAL];
+ /*
+ * Since each architecture defines a struct page for the zero page, just
+ * fall through and treat it like a normal page.
+ */
+ if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
+ if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
+ pte_unmap(ptep);
return -EFAULT;
}
- /*
- * Since each architecture defines a struct page for the zero
- * page, just fall through and treat it like a normal page.
- */
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return 0;
}
*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+ return hmm_vma_fault(addr, end, required_fault, walk);
}
static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -372,8 +323,9 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- uint64_t *pfns = range->pfns;
- unsigned long addr = start, i;
+ uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
+ unsigned long addr = start;
pte_t *ptep;
pmd_t pmd;
@@ -383,24 +335,19 @@ again:
return hmm_vma_walk_hole(start, end, -1, walk);
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
- bool fault, write_fault;
- unsigned long npages;
- uint64_t *pfns;
-
- i = (addr - range->start) >> PAGE_SHIFT;
- npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
-
- hmm_range_need_fault(hmm_vma_walk, pfns, npages,
- 0, &fault, &write_fault);
- if (fault || write_fault) {
+ if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(walk->mm, pmdp);
return -EBUSY;
}
- return 0;
- } else if (!pmd_present(pmd))
+ return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+ }
+
+ if (!pmd_present(pmd)) {
+ if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ }
if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
@@ -417,8 +364,7 @@ again:
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
- i = (addr - range->start) >> PAGE_SHIFT;
- return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
+ return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
}
/*
@@ -427,31 +373,23 @@ again:
* entry pointing to pte directory or it is a bad pmd that will not
* recover.
*/
- if (pmd_bad(pmd))
+ if (pmd_bad(pmd)) {
+ if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ }
ptep = pte_offset_map(pmdp, addr);
- i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+ for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
int r;
- r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
if (r) {
- /* hmm_vma_handle_pte() did unmap pte directory */
+ /* hmm_vma_handle_pte() did pte_unmap() */
hmm_vma_walk->last = addr;
return r;
}
}
- if (hmm_vma_walk->pgmap) {
- /*
- * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
- * so that we can leverage get_dev_pagemap() optimization which
- * will not re-take a reference on a pgmap if we already have
- * one.
- */
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
@@ -487,18 +425,18 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
pud = READ_ONCE(*pudp);
if (pud_none(pud)) {
- ret = hmm_vma_walk_hole(start, end, -1, walk);
- goto out_unlock;
+ spin_unlock(ptl);
+ return hmm_vma_walk_hole(start, end, -1, walk);
}
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
+ unsigned int required_fault;
uint64_t *pfns, cpu_flags;
- bool fault, write_fault;
if (!pud_present(pud)) {
- ret = hmm_vma_walk_hole(start, end, -1, walk);
- goto out_unlock;
+ spin_unlock(ptl);
+ return hmm_vma_walk_hole(start, end, -1, walk);
}
i = (addr - range->start) >> PAGE_SHIFT;
@@ -506,29 +444,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
pfns = &range->pfns[i];
cpu_flags = pud_to_hmm_pfn_flags(range, pud);
- hmm_range_need_fault(hmm_vma_walk, pfns, npages,
- cpu_flags, &fault, &write_fault);
- if (fault || write_fault) {
- ret = hmm_vma_walk_hole_(addr, end, fault,
- write_fault, walk);
- goto out_unlock;
+ required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
+ npages, cpu_flags);
+ if (required_fault) {
+ spin_unlock(ptl);
+ return hmm_vma_fault(addr, end, required_fault, walk);
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn) {
- hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
- hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap)) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ for (i = 0; i < npages; ++i, ++pfn)
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
- }
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
hmm_vma_walk->last = end;
goto out_unlock;
}
@@ -554,24 +480,20 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
uint64_t orig_pfn, cpu_flags;
- bool fault, write_fault;
+ unsigned int required_fault;
spinlock_t *ptl;
pte_t entry;
- int ret = 0;
ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
entry = huge_ptep_get(pte);
i = (start - range->start) >> PAGE_SHIFT;
orig_pfn = range->pfns[i];
- range->pfns[i] = range->values[HMM_PFN_NONE];
cpu_flags = pte_to_hmm_pfn_flags(range, entry);
- fault = write_fault = false;
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
- if (fault || write_fault) {
- ret = -ENOENT;
- goto unlock;
+ required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ if (required_fault) {
+ spin_unlock(ptl);
+ return hmm_vma_fault(addr, end, required_fault, walk);
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
@@ -579,14 +501,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
hmm_vma_walk->last = end;
-
-unlock:
spin_unlock(ptl);
-
- if (ret == -ENOENT)
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
-
- return ret;
+ return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
@@ -599,40 +515,32 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- /*
- * Skip vma ranges that don't have struct page backing them or
- * map I/O devices directly.
- */
- if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
- return -EFAULT;
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+ vma->vm_flags & VM_READ)
+ return 0;
/*
+ * vma ranges that don't have struct page backing them or map I/O
+ * devices directly cannot be handled by hmm_range_fault().
+ *
* If the vma does not allow read access, then assume that it does not
- * allow write access either. HMM does not support architectures
- * that allow write without read.
+ * allow write access either. HMM does not support architectures that
+ * allow write without read.
+ *
+ * If a fault is requested for an unsupported range then it is a hard
+ * failure.
*/
- if (!(vma->vm_flags & VM_READ)) {
- bool fault, write_fault;
-
- /*
- * Check to see if a fault is requested for any page in the
- * range.
- */
- hmm_range_need_fault(hmm_vma_walk, range->pfns +
- ((start - range->start) >> PAGE_SHIFT),
- (end - start) >> PAGE_SHIFT,
- 0, &fault, &write_fault);
- if (fault || write_fault)
- return -EFAULT;
-
- hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
- hmm_vma_walk->last = end;
+ if (hmm_range_need_fault(hmm_vma_walk,
+ range->pfns +
+ ((start - range->start) >> PAGE_SHIFT),
+ (end - start) >> PAGE_SHIFT, 0))
+ return -EFAULT;
- /* Skip this vma and continue processing the next vma. */
- return 1;
- }
+ hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+ hmm_vma_walk->last = end;
- return 0;
+ /* Skip this vma and continue processing the next vma. */
+ return 1;
}
static const struct mm_walk_ops hmm_walk_ops = {
@@ -645,8 +553,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
/**
* hmm_range_fault - try to fault some address in a virtual address range
- * @range: range being faulted
- * @flags: HMM_FAULT_* flags
+ * @range: argument structure
*
* Return: the number of valid pages in range->pfns[] (from range start
* address), which may be zero. On error one of the following status codes
@@ -657,26 +564,19 @@ static const struct mm_walk_ops hmm_walk_ops = {
* -ENOMEM: Out of memory.
* -EPERM: Invalid permission (e.g., asking for write and range is read
* only).
- * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped.
* -EBUSY: The range has been invalidated and the caller needs to wait for
* the invalidation to finish.
- * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access
- * that range) number of valid pages in range->pfns[] (from
- * range start address).
- *
- * This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs
- * and caller does not ask for migration.
+ * -EFAULT: A page was requested to be valid and could not be made valid
+ * ie it has no backing VMA or it is illegal to access
*
- * On error, for one virtual address in the range, the function will mark the
- * corresponding HMM pfn entry with an error flag.
+ * This is similar to get_user_pages(), except that it can read the page tables
+ * without mutating them (ie causing faults).
*/
-long hmm_range_fault(struct hmm_range *range, unsigned int flags)
+long hmm_range_fault(struct hmm_range *range)
{
struct hmm_vma_walk hmm_vma_walk = {
.range = range,
.last = range->start,
- .flags = flags,
};
struct mm_struct *mm = range->notifier->mm;
int ret;
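
The hmm.c rework above drops the per-call HMM_FAULT_* flags, the cached pgmap, and the -EAGAIN path; faulting behaviour is now driven entirely by range->default_flags and range->pfn_flags_mask, and hmm_range_fault() takes only the range. A minimal caller sketch under those assumptions (the my_fault_range() wrapper is hypothetical; only symbols that appear in this diff or are standard in this kernel version are used):

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>

/* Hypothetical helper: fault/snapshot a range with the new, flag-free API. */
static long my_fault_range(struct hmm_range *range)
{
	struct mm_struct *mm = range->notifier->mm;
	long ret;

	do {
		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		down_read(&mm->mmap_sem);
		ret = hmm_range_fault(range);	/* no flags argument any more */
		up_read(&mm->mmap_sem);
		if (ret < 0 && ret != -EBUSY)
			return ret;		/* -EFAULT, -EPERM, -ENOMEM, ... */
	} while (ret == -EBUSY ||
		 mmu_interval_read_retry(range->notifier, range->notifier_seq));

	return ret;	/* number of valid entries filled in range->pfns[] */
}
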
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b08b199f9a11..24ad53b4dfc0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3043,8 +3043,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
return;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = *pvmw->pmd;
- pmdp_invalidate(vma, address, pvmw->pmd);
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
if (pmd_dirty(pmdval))
set_page_dirty(page);
entry = make_migration_entry(page, pmd_write(pmdval));
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index e434b05416c6..5280bcf459af 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -240,8 +240,7 @@ again:
if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
&counter)) {
ret = -ENOMEM;
- hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx,
- HUGETLB_MAX);
+ hugetlb_event(h_cg, idx, HUGETLB_MAX);
}
css_put(&h_cg->css);
done:
diff --git a/mm/madvise.c b/mm/madvise.c
index 43b47d3fae02..4bb30ed6c8d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -335,12 +335,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
}
page = pmd_page(orig_pmd);
+
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- if (page_mapcount(page) != 1)
- goto huge_unlock;
-
get_page(page);
spin_unlock(ptl);
lock_page(page);
@@ -426,6 +428,10 @@ regular_page:
continue;
}
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ continue;
+
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f6dc8712e39..7ddf91c4295f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -409,8 +409,10 @@ int memcg_expand_shrinker_maps(int new_id)
if (mem_cgroup_is_root(memcg))
continue;
ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- if (ret)
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
goto unlock;
+ }
}
unlock:
if (!ret)
@@ -775,6 +777,17 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_obj(p);
+ if (memcg)
+ mod_memcg_state(memcg, idx, val);
+ rcu_read_unlock();
+}
+
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
@@ -2295,28 +2308,41 @@ static void high_work_func(struct work_struct *work)
#define MEMCG_DELAY_SCALING_SHIFT 14
/*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
*/
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
{
- unsigned long usage, high, clamped_high;
- unsigned long pflags;
- unsigned long penalty_jiffies, overage;
- unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg;
+ unsigned long penalty_jiffies;
+ u64 max_overage = 0;
- if (likely(!nr_pages))
- return;
+ do {
+ unsigned long usage, high;
+ u64 overage;
- memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
- current->memcg_nr_pages_over_high = 0;
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
+
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ overage = div64_u64(overage, high);
+
+ if (overage > max_overage)
+ max_overage = overage;
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ if (!max_overage)
+ return 0;
/*
- * memory.high is breached and reclaim is unable to keep up. Throttle
- * allocators proactively to slow down excessive growth.
- *
* We use overage compared to memory.high to calculate the number of
* jiffies to sleep (penalty_jiffies). Ideally this value should be
* fairly lenient on small overages, and increasingly harsh when the
@@ -2324,24 +2350,9 @@ void mem_cgroup_handle_over_high(void)
* its crazy behaviour, so we exponentially increase the delay based on
* overage amount.
*/
-
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
-
- if (usage <= high)
- goto out;
-
- /*
- * Prevent division by 0 in overage calculation by acting as if it was a
- * threshold of 1 page
- */
- clamped_high = max(high, 1UL);
-
- overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
- clamped_high);
-
- penalty_jiffies = ((u64)overage * overage * HZ)
- >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+ penalty_jiffies = max_overage * max_overage * HZ;
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
/*
* Factor in the task's own contribution to the overage, such that four
@@ -2358,7 +2369,32 @@ void mem_cgroup_handle_over_high(void)
* application moving forwards and also permit diagnostics, albeit
* extremely slowly.
*/
- penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned long penalty_jiffies;
+ unsigned long pflags;
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ struct mem_cgroup *memcg;
+
+ if (likely(!nr_pages))
+ return;
+
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ reclaim_high(memcg, nr_pages, GFP_KERNEL);
+ current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ */
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -2636,6 +2672,33 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+ struct page *page;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ page = virt_to_head_page(p);
+
+ /*
+ * Slab pages don't have page->mem_cgroup set because the corresponding
+ * kmem caches can be reparented during their lifetime. That's why
+ * memcg_from_slab_page() should be used instead.
+ */
+ if (PageSlab(page))
+ return memcg_from_slab_page(page);
+
+ /* All other pages use page->mem_cgroup */
+ return page->mem_cgroup;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -4025,7 +4088,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long usage;
- int i, j, size;
+ int i, j, size, entries;
mutex_lock(&memcg->thresholds_lock);
@@ -4045,14 +4108,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
- size = 0;
+ size = entries = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
+ else
+ entries++;
}
new = thresholds->spare;
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
kfree(new);
@@ -6680,19 +6749,9 @@ void mem_cgroup_sk_alloc(struct sock *sk)
if (!mem_cgroup_sockets_enabled)
return;
- /*
- * Socket cloning can throw us here with sk_memcg already
- * filled. It won't however, necessarily happen from
- * process context. So the test for root memcg given
- * the current task's memcg won't help us in this case.
- *
- * Respecting the original socket's memcg is a better
- * decision in this case.
- */
- if (sk->sk_memcg) {
- css_get(&sk->sk_memcg->css);
+ /* Do not associate the sock with unrelated interrupted task's memcg. */
+ if (in_interrupt())
return;
- }
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
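
The memcontrol.c refactor moves the memory.high overage math into calculate_high_delay() and, newly, takes the worst overage across the cgroup's ancestors. The arithmetic itself is unchanged: the overage is expressed as a fixed-point fraction of memory.high and the penalty grows with its square, clamped to a maximum. A standalone sketch of that fixed-point computation, using illustrative shift and HZ values rather than the kernel's exact constants:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for MEMCG_DELAY_PRECISION_SHIFT and friends. */
#define DELAY_PRECISION_SHIFT	20
#define DELAY_SCALING_SHIFT	14
#define HZ			1000
#define MAX_HIGH_DELAY		(2 * HZ)

/* Mirrors the overage -> penalty_jiffies math in calculate_high_delay(). */
static uint64_t penalty_jiffies(uint64_t usage, uint64_t high)
{
	uint64_t overage, penalty;

	if (high == 0)
		high = 1;	/* act as if the threshold were one page */
	if (usage <= high)
		return 0;

	overage = ((usage - high) << DELAY_PRECISION_SHIFT) / high;
	penalty = overage * overage * HZ;
	penalty >>= DELAY_PRECISION_SHIFT;
	penalty >>= DELAY_SCALING_SHIFT;

	return penalty < MAX_HIGH_DELAY ? penalty : MAX_HIGH_DELAY;
}

int main(void)
{
	/* e.g. 10% over the high limit, in pages */
	printf("%llu jiffies\n",
	       (unsigned long long)penalty_jiffies(28160, 25600));
	return 0;
}
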
diff --git a/mm/memory.c b/mm/memory.c
index 0bccc622e482..e8bfdf0d9d1d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2257,7 +2257,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
bool ret;
void *kaddr;
void __user *uaddr;
- bool force_mkyoung;
+ bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
@@ -2282,11 +2282,11 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte);
- if (force_mkyoung) {
+ if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/*
* Other thread has already handled the fault
@@ -2310,18 +2310,37 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ if (locked)
+ goto warn;
+
+ /* Re-validate under PTL if the page is still mapped */
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /* The PTE changed under us. Retry page fault. */
+ ret = false;
+ goto pte_unlock;
+ }
+
/*
- * Give a warn in case there can be some obscure
- * use-case
+ * The same page can be mapped back since the last copy attempt.
+ * Try to copy again under PTL.
*/
- WARN_ON_ONCE(1);
- clear_page(kaddr);
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ /*
+ * Give a warn in case there can be some obscure
+ * use-case
+ */
+warn:
+ WARN_ON_ONCE(1);
+ clear_page(kaddr);
+ }
}
ret = true;
pte_unlock:
- if (force_mkyoung)
+ if (locked)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0a54ffac8c68..19389cdc16a5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -574,7 +574,13 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
void generic_online_page(struct page *page, unsigned int order)
{
- kernel_map_pages(page, 1 << order, 1);
+ /*
+ * Freeing the page with debug_pagealloc enabled will try to unmap it,
+ * so we should map it first. This is better than introducing a special
+ * case in the page freeing fast path.
+ */
+ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 1);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
diff --git a/mm/memremap.c b/mm/memremap.c
index 09b5b7adc773..9b2c97ceb775 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -181,6 +181,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "Missing migrate_to_ram method\n");
return ERR_PTR(-EINVAL);
}
+ if (!pgmap->owner) {
+ WARN(1, "Missing owner\n");
+ return ERR_PTR(-EINVAL);
+ }
break;
case MEMORY_DEVICE_FS_DAX:
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
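
With the memremap.c hunk, a MEMORY_DEVICE_PRIVATE pagemap must carry an owner cookie; the same pointer is what hmm_range.dev_private_owner and migrate_vma.src_owner are matched against elsewhere in this series. A hedged sketch of a driver setting it up (every my_drv_* name is invented for illustration; only dev_pagemap fields and calls expected to exist in this kernel version are used):

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/err.h>

/* Hypothetical driver object, used only as an identity token. */
static struct my_drv_device {
	struct dev_pagemap pgmap;
} my_drv;

static vm_fault_t my_drv_migrate_to_ram(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;		/* a real driver migrates the data back */
}

static void my_drv_page_free(struct page *page)
{
	/* return the page to the device's allocator (elided) */
}

static const struct dev_pagemap_ops my_drv_pagemap_ops = {
	.migrate_to_ram	= my_drv_migrate_to_ram,
	.page_free	= my_drv_page_free,
};

static int my_drv_add_private_memory(struct resource *res, int nid)
{
	void *addr;

	my_drv.pgmap.type = MEMORY_DEVICE_PRIVATE;
	my_drv.pgmap.res = *res;
	my_drv.pgmap.ops = &my_drv_pagemap_ops;
	/* New requirement: without an owner, memremap_pages() now fails. */
	my_drv.pgmap.owner = &my_drv;

	addr = memremap_pages(&my_drv.pgmap, nid);
	return IS_ERR(addr) ? PTR_ERR(addr) : 0;
}
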
diff --git a/mm/migrate.c b/mm/migrate.c
index b1092876e537..7605d2c23433 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2241,7 +2241,7 @@ again:
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
- unsigned long mpfn, pfn;
+ unsigned long mpfn = 0, pfn;
struct page *page;
swp_entry_t entry;
pte_t pte;
@@ -2255,8 +2255,6 @@ again:
}
if (!pte_present(pte)) {
- mpfn = 0;
-
/*
* Only care about unaddressable device page special
* page table entry. Other special swap entries are not
@@ -2267,11 +2265,16 @@ again:
goto next;
page = device_private_entry_to_page(entry);
+ if (page->pgmap->owner != migrate->src_owner)
+ goto next;
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ if (migrate->src_owner)
+ goto next;
pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
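
The migrate.c change makes migrate_vma_setup() honour a new src_owner filter: device-private pages belonging to other owners are skipped, and when src_owner is set ordinary system pages are skipped too, so a driver only migrates back its own pages. A hedged caller sketch (my_drv_* names are hypothetical; the owner cookie is the same pointer stored in dev_pagemap.owner):

#include <linux/migrate.h>

/* Same cookie that the driver stored in its dev_pagemap.owner. */
static int my_drv_owner;

static int my_drv_migrate_back(struct vm_area_struct *vma,
			       unsigned long start, unsigned long end,
			       unsigned long *src, unsigned long *dst)
{
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.src		= src,
		.dst		= dst,
		/* Only collect device-private pages owned by this driver. */
		.src_owner	= &my_drv_owner,
	};
	int ret;

	ret = migrate_vma_setup(&args);
	if (ret)
		return ret;
	if (!args.cpages)
		return 0;		/* nothing of ours in this range */

	/* allocate destination pages and copy the data (elided), then: */
	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	return 0;
}
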
diff --git a/mm/mmap.c b/mm/mmap.c
index 6756b8bb0033..d681a20eb4ea 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -195,8 +195,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
bool downgraded = false;
LIST_HEAD(uf);
- brk = untagged_addr(brk);
-
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -1557,8 +1555,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
struct file *file = NULL;
unsigned long retval;
- addr = untagged_addr(addr);
-
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index ef3973a5d34a..06852b896fa6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -307,7 +307,8 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
* ->release returns.
*/
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu))
/*
* If ->release runs before mmu_notifier_unregister it must be
* handled, as it's the only way for the driver to flush all
@@ -370,7 +371,8 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->clear_flush_young)
young |= subscription->ops->clear_flush_young(
subscription, mm, start, end);
@@ -389,7 +391,8 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->clear_young)
young |= subscription->ops->clear_young(subscription,
mm, start, end);
@@ -407,7 +410,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->test_young) {
young = subscription->ops->test_young(subscription, mm,
address);
@@ -428,7 +432,8 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->change_pte)
subscription->ops->change_pte(subscription, mm, address,
pte);
@@ -476,7 +481,8 @@ static int mn_hlist_invalidate_range_start(
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
const struct mmu_notifier_ops *ops = subscription->ops;
if (ops->invalidate_range_start) {
@@ -528,7 +534,8 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
@@ -582,7 +589,8 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->invalidate_range)
subscription->ops->invalidate_range(subscription, mm,
start, end);
@@ -714,7 +722,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
spin_lock(&mm->notifier_subscriptions->lock);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ lockdep_is_held(&mm->notifier_subscriptions->lock)) {
if (subscription->ops != ops)
continue;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7a8e84f86831..311c0dadf71c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -161,6 +161,31 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return pages;
}
+/*
+ * Used when setting automatic NUMA hinting protection where it is
+ * critical that a numa hinting PMD is not confused with a bad PMD.
+ */
+static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+{
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ barrier();
+#endif
+
+ if (pmd_none(pmdval))
+ return 1;
+ if (pmd_trans_huge(pmdval))
+ return 0;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -178,8 +203,17 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
- && pmd_none_or_clear_bad(pmd))
+
+ /*
+ * Automatic NUMA balancing walks the tables with mmap_sem
+ * held for read. It's possible for a parallel update to occur
+ * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+ * check, leading to a false positive and clearing.
+ * Hence, it's necessary to atomically read the PMD value
+ * for all the checks.
+ */
+ if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+ pmd_none_or_clear_bad_unless_trans_huge(pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..d28f08a36b96 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -606,8 +606,17 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
+ /*
+ * There is a deliberate asymmetry here: we strip the pointer tag
+ * from the old address but leave the new address alone. This is
+ * for consistency with mmap(), where we prevent the creation of
+ * aliasing mappings in userspace by leaving the tag bits of the
+ * mapping address intact. A non-zero tag will cause the subsequent
+ * range checks to reject the address as invalid.
+ *
+ * See Documentation/arm64/tagged-address-abi.rst for more information.
+ */
addr = untagged_addr(addr);
- new_addr = untagged_addr(new_addr);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
return ret;
diff --git a/mm/nommu.c b/mm/nommu.c
index bd2b4e5ef144..318df4e236c9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -370,10 +370,14 @@ void vm_unmap_aliases(void)
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
+ * chose not to have one.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
+{
+}
+
+void __weak vmalloc_sync_unmappings(void)
{
}
diff --git a/mm/shmem.c b/mm/shmem.c
index c8f7540ef048..aad3ba74b0e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3386,8 +3386,6 @@ static const struct constant_table shmem_param_enums_huge[] = {
{"always", SHMEM_HUGE_ALWAYS },
{"within_size", SHMEM_HUGE_WITHIN_SIZE },
{"advise", SHMEM_HUGE_ADVISE },
- {"deny", SHMEM_HUGE_DENY },
- {"force", SHMEM_HUGE_FORCE },
{}
};
diff --git a/mm/slub.c b/mm/slub.c
index 17dc00e33115..6589b41d5a60 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1973,8 +1973,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- else if (!node_present_pages(node))
- searchnode = node_to_mem_node(node);
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
@@ -2563,17 +2561,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
struct page *page;
page = c->page;
- if (!page)
+ if (!page) {
+ /*
+ * if the node is not online or has no normal memory, just
+ * ignore the node constraint
+ */
+ if (unlikely(node != NUMA_NO_NODE &&
+ !node_state(node, N_NORMAL_MEMORY)))
+ node = NUMA_NO_NODE;
goto new_slab;
+ }
redo:
if (unlikely(!node_match(page, node))) {
- int searchnode = node;
-
- if (node != NUMA_NO_NODE && !node_present_pages(node))
- searchnode = node_to_mem_node(node);
-
- if (unlikely(!node_match(page, searchnode))) {
+ /*
+ * same as above but node_match() being false already
+ * implies node != NUMA_NO_NODE
+ */
+ if (!node_state(node, N_NORMAL_MEMORY)) {
+ node = NUMA_NO_NODE;
+ goto redo;
+ } else {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
@@ -2997,11 +3005,13 @@ redo:
barrier();
if (likely(page == c->page)) {
- set_freepointer(s, tail_obj, c->freelist);
+ void **freelist = READ_ONCE(c->freelist);
+
+ set_freepointer(s, tail_obj, freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
- c->freelist, tid,
+ freelist, tid,
head, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
@@ -3175,6 +3185,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
if (unlikely(!object)) {
/*
+ * We may have removed an object from c->freelist using
+ * the fastpath in the previous iteration; in that case,
+ * c->tid has not been bumped yet.
+ * Since ___slab_alloc() may reenable interrupts while
+ * allocating memory, we should bump c->tid now.
+ */
+ c->tid = next_tid(c->tid);
+
+ /*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
*/
diff --git a/mm/sparse.c b/mm/sparse.c
index c184b69460b7..65599e8bd636 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -734,6 +734,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
struct page *memmap = NULL;
+ bool empty;
unsigned long *subsection_map = ms->usage
? &ms->usage->subsection_map[0] : NULL;
@@ -764,7 +765,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
*/
bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
- if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+ if (empty) {
unsigned long section_nr = pfn_to_section_nr(pfn);
/*
@@ -779,13 +781,21 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- ms->section_mem_map = (unsigned long)NULL;
+ /*
+ * Mark the section invalid so that valid_section()
+ * returns false. This prevents code from dereferencing
+ * the ms->usage array.
+ */
+ ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
}
if (section_is_early && memmap)
free_map_bootmem(memmap);
else
depopulate_section_memmap(pfn, nr_pages, altmap);
+
+ if (empty)
+ ms->section_mem_map = (unsigned long)NULL;
}
static struct page * __meminit section_activate(int nid, unsigned long pfn,
@@ -876,7 +886,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2c33ff456ed5..be33e6176cd9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2899,10 +2899,6 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
p->bdev = inode->i_sb->s_bdev;
}
- inode_lock(inode);
- if (IS_SWAPFILE(inode))
- return -EBUSY;
-
return 0;
}
@@ -3157,36 +3153,41 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mapping = swap_file->f_mapping;
inode = mapping->host;
- /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
error = claim_swapfile(p, inode);
if (unlikely(error))
goto bad_swap;
+ inode_lock(inode);
+ if (IS_SWAPFILE(inode)) {
+ error = -EBUSY;
+ goto bad_swap_unlock_inode;
+ }
+
/*
* Read the swap header.
*/
if (!mapping->a_ops->readpage) {
error = -EINVAL;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
page = read_mapping_page(mapping, 0, swap_file);
if (IS_ERR(page)) {
error = PTR_ERR(page);
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
swap_header = kmap(page);
maxpages = read_swap_header(p, swap_header, inode);
if (unlikely(!maxpages)) {
error = -EINVAL;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
error = -ENOMEM;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
@@ -3211,7 +3212,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
GFP_KERNEL);
if (!cluster_info) {
error = -ENOMEM;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
for (ci = 0; ci < nr_cluster; ci++)
@@ -3220,7 +3221,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->percpu_cluster = alloc_percpu(struct percpu_cluster);
if (!p->percpu_cluster) {
error = -ENOMEM;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
@@ -3234,13 +3235,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
- goto bad_swap;
+ goto bad_swap_unlock_inode;
nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
cluster_info, maxpages, &span);
if (unlikely(nr_extents < 0)) {
error = nr_extents;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (IS_ENABLED(CONFIG_FRONTSWAP))
@@ -3280,7 +3281,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = init_swap_address_space(p->type, maxpages);
if (error)
- goto bad_swap;
+ goto bad_swap_unlock_inode;
/*
* Flush any pending IO and dirty mappings before we start using this
@@ -3290,7 +3291,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
mutex_lock(&swapon_mutex);
@@ -3315,6 +3316,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+bad_swap_unlock_inode:
+ inode_unlock(inode);
bad_swap:
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
@@ -3322,6 +3325,7 @@ bad_swap:
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
+ inode = NULL;
destroy_swap_extents(p);
swap_cgroup_swapoff(p->type);
spin_lock(&swap_lock);
@@ -3333,13 +3337,8 @@ bad_swap:
kvfree(frontswap_map);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
- if (swap_file) {
- if (inode) {
- inode_unlock(inode);
- inode = NULL;
- }
+ if (swap_file)
filp_close(swap_file, NULL);
- }
out:
if (page && !IS_ERR(page)) {
kunmap(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1f46c3b86f9f..6b8eeb0ecee5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1295,7 +1295,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* First make sure the mappings are removed from all page-tables
* before they are freed.
*/
- vmalloc_sync_all();
+ vmalloc_sync_unmappings();
/*
* TODO: to calculate a flush range without looping.
@@ -3128,16 +3128,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
EXPORT_SYMBOL(remap_vmalloc_range);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
+ * not to have one.
*
* The purpose of this function is to make sure the vmalloc area
* mappings are identical in all page-tables in the system.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
{
}
+void __weak vmalloc_sync_unmappings(void)
+{
+}
static int f(pte_t *pte, unsigned long addr, void *data)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c05eb9efec07..876370565455 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2415,10 +2415,13 @@ out:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
- * Make sure we don't miss the last page
- * because of a round-off error.
+ * Make sure we don't miss the last page on
+ * the offlined memory cgroups because of a
+ * round-off error.
*/
- scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ scan = mem_cgroup_online(memcg) ?
+ div64_u64(scan * fraction[file], denominator) :
+ DIV64_U64_ROUND_UP(scan * fraction[file],
denominator);
break;
case SCAN_FILE:
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 43754d8ebce8..42f31c4b53ad 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -41,7 +41,6 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/rwlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>