Diffstat (limited to 'mm')
45 files changed, 1793 insertions, 1037 deletions
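Editor's note, not part of the patch: among the interface changes in the hunks below, cma_alloc() gains an explicit gfp_t argument that is forwarded to alloc_contig_range(). A minimal, hedged sketch of how an existing caller adapts (the helper name is hypothetical; only the cma_alloc() call and its new GFP_KERNEL argument mirror the diff, cf. the mm/cma_debug.c hunk):

/* Hypothetical caller updated for the new cma_alloc() prototype:
 * struct page *cma_alloc(struct cma *cma, size_t count,
 *                        unsigned int align, gfp_t gfp_mask);
 */
#include <linux/cma.h>
#include <linux/gfp.h>

static struct page *example_cma_get(struct cma *cma, size_t nr_pages)
{
	/* Old callers passed (cma, count, align); the GFP mask is now explicit. */
	return cma_alloc(cma, nr_pages, 0, GFP_KERNEL);
}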
diff --git a/mm/Makefile b/mm/Makefile index 433eaf9a876e..aa0aa17cb413 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,8 +23,10 @@ KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + mlock.o mmap.o mprotect.o mremap.o msync.o \ + page_vma_mapped.o pagewalk.o pgtable-generic.o \ + rmap.o vmalloc.o + ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o @@ -348,6 +348,32 @@ err: return ret; } +#ifdef CONFIG_CMA_DEBUG +static void cma_debug_show_areas(struct cma *cma) +{ + unsigned long next_zero_bit, next_set_bit; + unsigned long start = 0; + unsigned int nr_zero, nr_total = 0; + + mutex_lock(&cma->lock); + pr_info("number of available pages: "); + for (;;) { + next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); + if (next_zero_bit >= cma->count) + break; + next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + nr_zero = next_set_bit - next_zero_bit; + pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); + nr_total += nr_zero; + start = next_zero_bit + nr_zero; + } + pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + mutex_unlock(&cma->lock); +} +#else +static inline void cma_debug_show_areas(struct cma *cma) { } +#endif + /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. @@ -357,14 +383,15 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) +struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; - int ret; + int ret = -ENOMEM; if (!cma || !cma->count) return NULL; @@ -402,7 +429,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, + gfp_mask); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -421,6 +449,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) trace_cma_alloc(pfn, page, count, align); + if (ret) { + pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, count, ret); + cma_debug_show_areas(cma); + } + pr_debug("%s(): returned %p\n", __func__, page); return page; } diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f8e4b60db167..ffc0c3d0ae64 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -138,7 +138,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0); + p = cma_alloc(cma, count, 0, GFP_KERNEL); if (!p) { kfree(mem); return -ENOMEM; diff --git a/mm/compaction.c b/mm/compaction.c index 0aa2757399ee..0fdfde016ee2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -802,7 +802,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = false; } - if (isolate_movable_page(page, isolate_mode)) + if (!isolate_movable_page(page, isolate_mode)) goto isolate_success; } diff --git a/mm/dmapool.c b/mm/dmapool.c index abcbfe86c25a..cef82b8a9291 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c 
@@ -434,11 +434,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, - "dma_pool_free %s, %p (bad vaddr)/%Lx\n", - pool->name, vaddr, (unsigned long long)dma); + "dma_pool_free %s, %p (bad vaddr)/%pad\n", + pool->name, vaddr, &dma); else - pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n", - pool->name, vaddr, (unsigned long long)dma); + pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n", + pool->name, vaddr, &dma); return; } { @@ -450,11 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", - pool->name, (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n", + pool->name, &dma); else - pr_err("dma_pool_free %s, dma %Lx already free\n", - pool->name, (unsigned long long)dma); + pr_err("dma_pool_free %s, dma %pad already free\n", + pool->name, &dma); return; } } diff --git a/mm/filemap.c b/mm/filemap.c index 416d563468a3..1944c631e3e6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1008,9 +1008,12 @@ void page_endio(struct page *page, bool is_write, int err) unlock_page(page); } else { if (err) { + struct address_space *mapping; + SetPageError(page); - if (page->mapping) - mapping_set_error(page->mapping, err); + mapping = page_mapping(page); + if (mapping) + mapping_set_error(mapping, err); } end_page_writeback(page); } @@ -2169,7 +2172,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, /** * filemap_fault - read in file data for page fault handling - * @vma: vma in which the fault was taken * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a @@ -2191,10 +2193,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { int error; - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2216,12 +2218,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. 
*/ - do_async_mmap_readahead(vma, ra, file, page, offset); + do_async_mmap_readahead(vmf->vma, ra, file, page, offset); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vma, ra, file, offset); + do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2229,7 +2231,7 @@ retry_find: goto no_cached_page; } - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { put_page(page); return ret | VM_FAULT_RETRY; } @@ -2396,14 +2398,14 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); @@ -253,6 +253,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); @@ -265,8 +272,6 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) - return no_page_table(vma, flags); if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -277,6 +282,9 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (likely(!pmd_trans_huge(*pmd))) return follow_page_pte(vma, address, pmd, flags); + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + return no_page_table(vma, flags); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9ecc2aeadfc..71e3dede95b4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pud = pud_mkwrite(pud); + return pud; +} + +static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t entry; + spinlock_t *ptl; + + ptl = pud_lock(mm, pud); + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pud_mkdevmap(entry); + if (write) { + entry = pud_mkyoung(pud_mkdirty(entry)); + entry = maybe_pud_mkwrite(entry, vma); + } + set_pud_at(mm, addr, pud, entry); + update_mmu_cache_pud(vma, addr, pud); + spin_unlock(ptl); +} + +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pud_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + 
* can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON(!pfn_t_devmap(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + + insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -887,6 +941,123 @@ out: return ret; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud) +{ + pud_t _pud; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pud is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pud to + * set the young bit, instead of the current set_pud_at. + */ + _pud = pud_mkyoung(pud_mkdirty(*pud)); + if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, + pud, _pud, 1)) + update_mmu_cache_pud(vma, addr, pud); +} + +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags) +{ + unsigned long pfn = pud_pfn(*pud); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pud_lockptr(mm, pud)); + + if (flags & FOLL_WRITE && !pud_write(*pud)) + return NULL; + + if (pud_present(*pud) && pud_devmap(*pud)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pud); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma) +{ + spinlock_t *dst_ptl, *src_ptl; + pud_t pud; + int ret; + + dst_ptl = pud_lock(dst_mm, dst_pud); + src_ptl = pud_lockptr(src_mm, src_pud); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pud = *src_pud; + if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + goto out_unlock; + + /* + * When page table lock is held, the huge zero pud should not be + * under splitting since we don't split the page itself, only pud to + * a page table. 
+ */ + if (is_huge_zero_pud(pud)) { + /* No huge zero pud yet */ + } + + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_mkold(pud_wrprotect(pud)); + set_pud_at(dst_mm, addr, dst_pud, pud); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + return ret; +} + +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ + pud_t entry; + unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); + if (unlikely(!pud_same(*vmf->pud, orig_pud))) + goto unlock; + + entry = pud_mkyoung(orig_pud); + if (write) + entry = pud_mkdirty(entry); + haddr = vmf->address & HPAGE_PUD_MASK; + if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) + update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); + +unlock: + spin_unlock(vmf->ptl); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; @@ -1255,7 +1426,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!pmd_write(pmd)) + if (!pmd_savedwrite(pmd)) flags |= TNF_NO_GROUP; /* @@ -1318,7 +1489,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - was_writable = pmd_write(pmd); + was_writable = pmd_savedwrite(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (was_writable) @@ -1335,7 +1506,7 @@ out: if (page_nid != -1) task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, - vmf->flags); + flags); return 0; } @@ -1573,7 +1744,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) - entry = pmd_mkwrite(entry); + entry = pmd_mk_savedwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(vma_is_anonymous(vma) && !preserve_write && @@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return NULL; } +/* + * Returns true if a given pud maps a thp, false otherwise. + * + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. + */ +spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pud, unsigned long addr) +{ + pud_t orig_pud; + spinlock_t *ptl; + + ptl = __pud_trans_huge_lock(pud, vma); + if (!ptl) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pudp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pudp related + * operations. 
+ */ + orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, + tlb->fullmm); + tlb_remove_pud_tlb_entry(tlb, pud, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + /* No zero page support yet */ + } else { + /* No support for anonymous PUD pages yet */ + BUG(); + } + return 1; +} + +static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, + unsigned long haddr) +{ + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); + VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + + count_vm_event(THP_SPLIT_PMD); + + pudp_huge_clear_flush_notify(vma, haddr, pud); +} + +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PUD_MASK; + + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); + ptl = pud_lock(mm, pud); + if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + goto out; + __split_huge_pud_locked(vma, pud, haddr); + +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { @@ -1857,32 +2106,27 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | - TTU_RMAP_LOCKED; - int i, ret; + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; + int ret; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - /* We only need TTU_SPLIT_HUGE_PMD once */ - ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); - for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { - /* Cut short if the page is unmapped */ - if (page_count(page) == 1) - return; - - ret = try_to_unmap(page + i, ttu_flags); - } - VM_BUG_ON_PAGE(ret, page + i - 1); + ret = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(ret, page); } static void unfreeze_page(struct page *page) { int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) - remove_migration_ptes(page + i, page + i, true); + if (PageTransHuge(page)) { + remove_migration_ptes(page, page, true); + } else { + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); + } } static void __split_huge_page_tail(struct page *head, int tail, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 30e7709a5121..2e0e8159ce8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + GFP_KERNEL); } static bool pfn_range_valid_gigantic(struct zone *z, @@ -3142,7 +3143,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ -static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/internal.h b/mm/internal.h index 8ab72f4374e0..ccfc2a2969f4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -335,12 +335,15 @@ __vma_address(struct page *page, struct vm_area_struct *vma) static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { - unsigned long address = __vma_address(page, vma); + unsigned long start, end; + + start = __vma_address(page, vma); + end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); - return address; + return max(start, vma->vm_start); } #else /* !CONFIG_MMU */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index b2a0cff2bb35..25f0e6521f36 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -435,7 +435,7 @@ void kasan_cache_shrink(struct kmem_cache *cache) quarantine_remove_cache(cache); } -void kasan_cache_destroy(struct kmem_cache *cache) +void kasan_cache_shutdown(struct kmem_cache *cache) { quarantine_remove_cache(cache); } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index dae929c02bbb..6f1ed1630873 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -274,6 +274,7 @@ static void per_cpu_remove_cache(void *arg) qlist_free_all(&to_free, cache); } +/* Free all quarantined objects belonging to cache. */ void quarantine_remove_cache(struct kmem_cache *cache) { unsigned long flags, i; @@ -223,6 +223,12 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +/* Checksum of an empty (zeroed) page */ +static unsigned int zero_checksum __read_mostly; + +/* Whether to merge empty (zeroed) pages with actual zero pages */ +static bool ksm_use_zero_pages __read_mostly; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -850,33 +856,36 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr; - pte_t *ptep; - spinlock_t *ptl; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + }; int swapped; int err = -EFAULT; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) + pvmw.address = page_address_in_vma(page, vma); + if (pvmw.address == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; + mmun_start = pvmw.address; + mmun_end = pvmw.address + PAGE_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!page_vma_mapped_walk(&pvmw)) goto out_mn; + if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) + goto out_unlock; - if (pte_write(*ptep) || pte_dirty(*ptep)) { + if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) { pte_t entry; swapped = PageSwapCache(page); - flush_cache_page(vma, addr, page_to_pfn(page)); + flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore 
the check that we are going to make @@ -886,25 +895,29 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. */ - entry = ptep_clear_flush_notify(vma, addr, ptep); + entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at(mm, addr, ptep, entry); + set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) set_page_dirty(page); - entry = pte_mkclean(pte_wrprotect(entry)); - set_pte_at_notify(mm, addr, ptep, entry); + + if (pte_protnone(entry)) + entry = pte_mkclean(pte_clear_savedwrite(entry)); + else + entry = pte_mkclean(pte_wrprotect(entry)); + set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *ptep; + *orig_pte = *pvmw.pte; err = 0; out_unlock: - pte_unmap_unlock(ptep, ptl); + page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: @@ -926,6 +939,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; pte_t *ptep; + pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; @@ -950,12 +964,22 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out_mn; } - get_page(kpage); - page_add_anon_rmap(kpage, vma, addr, false); + /* + * No need to check ksm_use_zero_pages here: we can only have a + * zero_page here if ksm_use_zero_pages was enabled alreaady. + */ + if (!is_zero_pfn(page_to_pfn(kpage))) { + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr, false); + newpte = mk_pte(kpage, vma->vm_page_prot); + } else { + newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), + vma->vm_page_prot)); + } flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); if (!page_mapped(page)) @@ -1467,6 +1491,23 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) return; } + /* + * Same checksum as an empty page. We attempt to merge it with the + * appropriate zero page if the user enabled this via sysfs. + */ + if (ksm_use_zero_pages && (checksum == zero_checksum)) { + struct vm_area_struct *vma; + + vma = find_mergeable_vma(rmap_item->mm, rmap_item->address); + err = try_to_merge_one_page(vma, page, + ZERO_PAGE(rmap_item->address)); + /* + * In case of failure, the page was not really empty, so we + * need to continue. Otherwise we're done. 
+ */ + if (!err) + return; + } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { @@ -2233,6 +2274,28 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, KSM_ATTR(merge_across_nodes); #endif +static ssize_t use_zero_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_use_zero_pages); +} +static ssize_t use_zero_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return -EINVAL; + + ksm_use_zero_pages = value; + + return count; +} +KSM_ATTR(use_zero_pages); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2290,6 +2353,7 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &use_zero_pages_attr.attr, NULL, }; @@ -2304,6 +2368,11 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; + /* The correct value depends on page size and endianness */ + zero_checksum = calc_checksum(ZERO_PAGE(0)); + /* Default to false for backwards compatibility */ + ksm_use_zero_pages = false; + err = ksm_slab_init(); if (err) goto out; diff --git a/mm/madvise.c b/mm/madvise.c index b530a4986035..dc5927c812d3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -21,6 +21,7 @@ #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> @@ -92,14 +93,28 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; } @@ -120,15 +135,37 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; goto out; + } + error = __split_vma(mm, vma, start, 1); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + goto out; + } } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; + goto out; + } + error = __split_vma(mm, vma, end, 0); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } } success: @@ -136,10 +173,7 @@ success: * vm_flags is protected by the mmap_sem held in write mode. 
*/ vma->vm_flags = new_flags; - out: - if (error == -ENOMEM) - error = -EAGAIN; return error; } @@ -479,7 +513,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (!can_madv_dontneed_vma(vma)) return -EINVAL; - madvise_userfault_dontneed(vma, prev, start, end); + userfaultfd_remove(vma, prev, start, end); zap_page_range(vma, start, end - start); return 0; } @@ -520,6 +554,7 @@ static long madvise_remove(struct vm_area_struct *vma, * mmap_sem. */ get_file(f); + userfaultfd_remove(vma, prev, start, end); up_read(¤t->mm->mmap_sem); error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, diff --git a/mm/memblock.c b/mm/memblock.c index c004f52be419..b64b47803e52 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -35,15 +35,18 @@ struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .reserved.name = "reserved", #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP .physmem.regions = memblock_physmem_init_regions, .physmem.cnt = 1, /* empty dummy entry */ .physmem.max = INIT_PHYSMEM_REGIONS, + .physmem.name = "physmem", #endif .bottom_up = false, @@ -64,18 +67,6 @@ ulong __init_memblock choose_memblock_flags(void) return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } -/* inline so we don't get a warning when pr_debug is compiled out */ -static __init_memblock const char * -memblock_type_name(struct memblock_type *type) -{ - if (type == &memblock.memory) - return "memory"; - else if (type == &memblock.reserved) - return "reserved"; - else - return "unknown"; -} - /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { @@ -402,12 +393,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", - memblock_type_name(type), type->max, type->max * 2); + type->name, type->max, type->max * 2); return -1; } memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - memblock_type_name(type), type->max * 2, (u64)addr, + type->name, type->max * 2, (u64)addr, (u64)addr + new_size - 1); /* @@ -1693,14 +1684,14 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) return memblock.current_limit; } -static void __init_memblock memblock_dump(struct memblock_type *type, char *name) +static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; unsigned long flags; int idx; struct memblock_region *rgn; - pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); + pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; @@ -1715,7 +1706,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", - name, idx, &base, &end, &size, nid_buf, flags); + type->name, idx, &base, &end, &size, nid_buf, flags); } } @@ -1726,8 +1717,11 @@ void __init_memblock __memblock_dump_all(void) &memblock.memory.total_size, &memblock.reserved.total_size); - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + memblock_dump(&memblock.memory); + 
memblock_dump(&memblock.reserved); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + memblock_dump(&memblock.physmem); +#endif } void __init memblock_allow_resize(void) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1fd6affcdde7..45867e439d31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,6 +35,7 @@ #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> +#include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/smp.h> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f283c7e0a2a3..3d0f2fd4bf73 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1527,7 +1527,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) { int ret = __get_any_page(page, pfn, flags); - if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { + if (ret == 1 && !PageHuge(page) && + !PageLRU(page) && !__PageMovable(page)) { /* * Try to free it. */ @@ -1649,7 +1650,10 @@ static int __soft_offline_page(struct page *page, int flags) * Try to migrate to a new page instead. migrate.c * handles a large number of cases for us. */ - ret = isolate_lru_page(page); + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); /* * Drop page reference which is came from get_any_page() * successful isolate_lru_page() already took another one. @@ -1657,18 +1661,20 @@ static int __soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + /* + * After isolated lru page, the PageLRU will be cleared, + * so use !__PageMovable instead for LRU page's mapping + * cannot have PAGE_MAPPING_MOVABLE. + */ + if (!__PageMovable(page)) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - if (!list_empty(&pagelist)) { - list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - putback_lru_page(page); - } + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory.c b/mm/memory.c index 7663068a33c6..14fc0b40f0bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -30,7 +30,7 @@ /* * 05.04.94 - Multi-page memory management added for v1.1. - * Idea by Alex Bligh (alex@cconcepts.co.uk) + * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) @@ -82,9 +82,9 @@ #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; -struct page *mem_map; - EXPORT_SYMBOL(max_mapnr); + +struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif @@ -95,8 +95,7 @@ EXPORT_SYMBOL(mem_map); * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL * and ZONE_HIGHMEM. 
*/ -void * high_memory; - +void *high_memory; EXPORT_SYMBOL(high_memory); /* @@ -120,10 +119,10 @@ static int __init disable_randmaps(char *s) __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; - EXPORT_SYMBOL(zero_pfn); +unsigned long highest_memmap_pfn __read_mostly; + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -556,7 +555,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next? next->vm_start: ceiling); + floor, next ? next->vm_start : ceiling); } else { /* * Optimization: gather nearby vmas into one call down @@ -569,7 +568,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, - floor, next? next->vm_start: ceiling); + floor, next ? next->vm_start : ceiling); } vma = next; } @@ -1001,7 +1000,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1032,6 +1031,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src src_pud = pud_offset(src_pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + int err; + + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + err = copy_huge_pud(dst_mm, src_mm, + dst_pud, src_pud, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, @@ -1129,9 +1140,8 @@ again: arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; - if (pte_none(ptent)) { + if (pte_none(ptent)) continue; - } if (pte_present(ptent)) { struct page *page; @@ -1263,9 +1273,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (next - addr != HPAGE_PUD_SIZE) { + VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); + split_huge_pud(vma, pud, addr); + } else if (zap_huge_pud(tlb, vma, pud, addr)) + goto next; + /* fall through */ + } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); +next: + cond_resched(); } while (pud++, addr = next, addr != end); return addr; @@ -1441,10 +1461,10 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { - pgd_t * pgd = pgd_offset(mm, addr); - pud_t * pud = pud_alloc(mm, pgd, addr); + pgd_t *pgd = pgd_offset(mm, addr); + pud_t *pud = pud_alloc(mm, pgd, addr); if (pud) { - pmd_t * pmd = pmd_alloc(mm, pud, addr); + pmd_t *pmd = pmd_alloc(mm, pud, addr); if (pmd) { VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); @@ -2035,7 +2055,7 @@ static int do_page_mkwrite(struct vm_fault *vmf) vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | 
VM_FAULT_NOPAGE))) @@ -2307,7 +2327,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); @@ -2503,7 +2523,7 @@ void unmap_mapping_range(struct address_space *mapping, hlen = ULONG_MAX - hba + 1; } - details.check_mapping = even_cows? NULL: mapping; + details.check_mapping = even_cows ? NULL : mapping; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2861,7 +2881,7 @@ static int __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; int ret; - ret = vma->vm_ops->fault(vma, vmf); + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; @@ -2898,7 +2918,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) atomic_long_inc(&vma->vm_mm->nr_ptes); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); - vmf->prealloc_pte = 0; + vmf->prealloc_pte = NULL; } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } @@ -2946,7 +2966,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * count that as nr_ptes. */ atomic_long_inc(&vma->vm_mm->nr_ptes); - vmf->prealloc_pte = 0; + vmf->prealloc_pte = NULL; } static int do_set_pmd(struct vm_fault *vmf, struct page *page) @@ -3352,7 +3372,7 @@ static int do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { pte_free(vma->vm_mm, vmf->prealloc_pte); - vmf->prealloc_pte = 0; + vmf->prealloc_pte = NULL; } return ret; } @@ -3380,32 +3400,32 @@ static int do_numa_page(struct vm_fault *vmf) int last_cpupid; int target_nid; bool migrated = false; - pte_t pte = vmf->orig_pte; - bool was_writable = pte_write(pte); + pte_t pte; + bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* - * The "pte" at this point cannot be used safely without - * validation through pte_unmap_same(). It's of NUMA type but - * the pfn may be screwed if the read is non atomic. - * - * We can safely just do a "set_pte_at()", because the old - * page table entry is not accessible, so there would be no - * concurrent hardware modifications to the PTE. - */ + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + */ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, pte))) { + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } - /* Make it present again */ + /* + * Make it present again, Depending on how arch implementes non + * accessible ptes, some can allow access by kernel mode. 
+ */ + pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); @@ -3466,8 +3486,8 @@ static int create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); return VM_FAULT_FALLBACK; } @@ -3475,8 +3495,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -3490,6 +3510,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } +static int create_huge_pud(struct vm_fault *vmf) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3605,22 +3649,46 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; - pud_t *pud; + int ret; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) + + vmf.pud = pud_alloc(mm, pgd, address); + if (!vmf.pud) return VM_FAULT_OOM; - vmf.pmd = pmd_alloc(mm, pud, address); + if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + ret = create_huge_pud(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + pud_t orig_pud = *vmf.pud; + + barrier(); + if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + /* NUMA case for anonymous PUDs would go here */ + + if (dirty && !pud_write(orig_pud)) { + ret = wp_huge_pud(&vmf, orig_pud); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pud_set_accessed(&vmf, orig_pud); + return 0; + } + } + } + + vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&vmf); + ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pmd_t orig_pmd = *vmf.pmd; - int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { @@ -3680,14 
+3748,14 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) { mem_cgroup_oom_disable(); - /* - * The task may have entered a memcg OOM situation but - * if the allocation error was handled gracefully (no - * VM_FAULT_OOM), there is no need to kill anything. - * Just clean up the OOM state peacefully. - */ - if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) - mem_cgroup_oom_synchronize(false); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); } /* @@ -3737,13 +3805,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { + spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); + ptl = pud_lock(mm, pud); #ifndef __ARCH_HAS_4LEVEL_HACK if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); @@ -3757,7 +3826,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } else /* Another has populated it */ pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d67787d10ff0..1d3ed58f92ab 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -126,6 +126,8 @@ void put_online_mems(void) void mem_hotplug_begin(void) { + assert_held_device_hotplug(); + mem_hotplug.active_writer = current; memhp_lock_acquire(); @@ -862,7 +864,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, return ret; } -EXPORT_SYMBOL_GPL(__remove_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) @@ -1336,7 +1337,7 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, static int online_memory_block(struct memory_block *mem, void *arg) { - return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + return device_online(&mem->dev); } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ @@ -1508,7 +1509,7 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) i++; - if (i == MAX_ORDER_NR_PAGES) + if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) @@ -1522,7 +1523,7 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, if (zone) { *valid_start = start; - *valid_end = end; + *valid_end = min(end, end_pfn); return 1; } else { return 0; @@ -1530,10 +1531,10 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, } /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages - * and hugepages). We scan pfn because it's much easier than scanning over - * linked list. This function returns the pfn of the first found movable - * page if it's found, otherwise 0. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, + * non-lru movable pages and hugepages). We scan pfn because it's much + * easier than scanning over linked list. 
This function returns the pfn + * of the first found movable page if it's found, otherwise 0. */ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { @@ -1544,6 +1545,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (__PageMovable(page)) + return pfn; if (PageHuge(page)) { if (page_huge_active(page)) return pfn; @@ -1620,21 +1623,25 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!get_page_unless_zero(page)) continue; /* - * We can skip free pages. And we can only deal with pages on - * LRU. + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. */ - ret = isolate_lru_page(page); + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + if (!__PageMovable(page)) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } else { #ifdef CONFIG_DEBUG_VM - pr_alert("removing pfn %lx from LRU failed\n", pfn); - dump_page(page, "failed to remove from LRU"); + pr_alert("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); #endif put_page(page); /* Because we don't have big zone->lock. we should diff --git a/mm/migrate.c b/mm/migrate.c index 87f4d0f81819..2c63ac06791b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -74,7 +74,7 @@ int migrate_prep_local(void) return 0; } -bool isolate_movable_page(struct page *page, isolate_mode_t mode) +int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -125,14 +125,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) __SetPageIsolated(page); unlock_page(page); - return true; + return 0; out_no_isolated: unlock_page(page); out_putpage: put_page(page); out: - return false; + return -EBUSY; } /* It should be called on page which is PG_movable */ @@ -193,82 +193,62 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, +static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { - struct mm_struct *mm = vma->vm_mm; + struct page_vma_mapped_walk pvmw = { + .page = old, + .vma = vma, + .address = addr, + .flags = PVMW_SYNC | PVMW_MIGRATION, + }; + struct page *new; + pte_t pte; swp_entry_t entry; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - if (unlikely(PageHuge(new))) { - ptep = huge_pte_offset(mm, addr); - if (!ptep) - goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); - } else { - pmd = mm_find_pmd(mm, addr); - if (!pmd) - goto out; + VM_BUG_ON_PAGE(PageTail(page), page); + while (page_vma_mapped_walk(&pvmw)) { + new = page - pvmw.page->index + + linear_page_index(vma, pvmw.address); - ptep = pte_offset_map(pmd, addr); + get_page(new); + pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_mksoft_dirty(pte); /* - * Peek to check is_swap_pte() before taking ptlock? No, we - * can race mremap's move_ptes(), which skips anon_vma lock. 
+ * Recheck VMA as permissions can change since migration started */ - - ptl = pte_lockptr(mm, pmd); - } - - spin_lock(ptl); - pte = *ptep; - if (!is_swap_pte(pte)) - goto unlock; - - entry = pte_to_swp_entry(pte); - - if (!is_migration_entry(entry) || - migration_entry_to_page(entry) != old) - goto unlock; - - get_page(new); - pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) - pte = pte_mksoft_dirty(pte); - - /* Recheck VMA as permissions can change since migration started */ - if (is_write_migration_entry(entry)) - pte = maybe_mkwrite(pte, vma); + entry = pte_to_swp_entry(*pvmw.pte); + if (is_write_migration_entry(entry)) + pte = maybe_mkwrite(pte, vma); #ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(new)) { - pte = pte_mkhuge(pte); - pte = arch_make_huge_pte(pte, vma, new, 0); - } + if (PageHuge(new)) { + pte = pte_mkhuge(pte); + pte = arch_make_huge_pte(pte, vma, new, 0); + } #endif - flush_dcache_page(new); - set_pte_at(mm, addr, ptep, pte); + flush_dcache_page(new); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageHuge(new)) { - if (PageAnon(new)) - hugepage_add_anon_rmap(new, vma, addr); + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, pvmw.address); + else + page_dup_rmap(new, true); + } else if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); else - page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr, false); - else - page_add_file_rmap(new, false); + page_add_file_rmap(new, false); - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) + mlock_vma_page(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, pvmw.address, pvmw.pte); + } - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, ptep); -unlock: - pte_unmap_unlock(ptep, ptl); -out: return SWAP_AGAIN; } diff --git a/mm/mincore.c b/mm/mincore.c index ddb872da3f5b..c5687c45c326 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,6 +14,7 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/uaccess.h> diff --git a/mm/mmap.c b/mm/mmap.c index b729084eea90..499b988b1639 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) struct mm_struct *mm = current->mm; unsigned long min_brk; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Always allow shrinking brk. */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) goto set_brk; goto out; } @@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. 
*/ - if (do_brk(oldbrk, newbrk-oldbrk) < 0) + if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) goto out; set_brk: mm->brk = brk; populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; @@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm, unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; @@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2495,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * __split_vma() bypasses sysctl_max_map_count checking. We use this on the - * munmap path where it doesn't make sense to fail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. */ -static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; int err; @@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge <jeremy@goop.org> */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (vma->vm_start >= end) return 0; + if (uf) { + int error = userfaultfd_unmap_prep(vma, start, end, uf); + + if (error) + return error; + } + /* * If we need to split any vma, do it now to save pain later. 
* @@ -2668,27 +2680,22 @@ int vm_munmap(unsigned long start, size_t len) { int ret; struct mm_struct *mm = current->mm; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len); + ret = do_munmap(mm, start, len, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { - int ret; - struct mm_struct *mm = current->mm; - profile_munmap(addr); - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - ret = do_munmap(mm, addr, len); - up_write(&mm->mmap_sem); - return ret; + return vm_munmap(addr, len); } @@ -2780,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap_pgoff(vma->vm_file, start, size, - prot, flags, pgoff, &populate); + prot, flags, pgoff, &populate, NULL); fput(file); out: up_write(&mm->mmap_sem); @@ -2806,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2845,7 +2852,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2892,9 +2899,9 @@ out: return 0; } -static int do_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) { - return do_brk_flags(addr, len, 0); + return do_brk_flags(addr, len, 0, uf); } int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) @@ -2902,13 +2909,15 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) struct mm_struct *mm = current->mm; int ret; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk_flags(addr, len, flags); + ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; @@ -3125,8 +3134,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf); +static int special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. 
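For illustration only (this sketch is not part of the patch): every caller of the widened do_munmap() is expected to follow the same shape as vm_munmap() above, collecting the VMAs that need a userfaultfd unmap notification in a local list and delivering the event only once mmap_sem has been dropped. The function name below is hypothetical and the usual mm/userfaultfd headers are assumed.

static int example_unmap(struct mm_struct *mm, unsigned long start, size_t len)
{
        LIST_HEAD(uf);          /* VMAs needing UFFD_EVENT_UNMAP are queued here */
        int ret;

        if (down_write_killable(&mm->mmap_sem))
                return -EINTR;
        ret = do_munmap(mm, start, len, &uf);
        up_write(&mm->mmap_sem);
        /* Deliver the unmap events only after mmap_sem is released. */
        userfaultfd_unmap_complete(mm, &uf);
        return ret;
}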
@@ -3161,9 +3169,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int special_mapping_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; @@ -3173,7 +3181,7 @@ static int special_mapping_fault(struct vm_area_struct *vma, struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) - return sm->fault(sm, vma, vmf); + return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; } @@ -3447,7 +3455,7 @@ void mm_drop_all_locks(struct mm_struct *mm) } /* - * initialise the VMA slab + * initialise the percpu counter for VM */ void __init mmap_init(void) { diff --git a/mm/mprotect.c b/mm/mprotect.c index a45b4dc6a7f5..848e946b08e5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -99,7 +99,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); if (preserve_write) - ptent = pte_mkwrite(ptent); + ptent = pte_mk_savedwrite(ptent); /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && diff --git a/mm/mremap.c b/mm/mremap.c index 8779928d6a70..8233b0105c82 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -252,7 +252,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr, - bool *locked, struct vm_userfaultfd_ctx *uf) + bool *locked, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -341,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); - if (do_munmap(mm, old_addr, old_len) < 0) { + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; @@ -417,7 +418,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, - struct vm_userfaultfd_ctx *uf) + struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -435,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len); + ret = do_munmap(mm, new_addr, new_len, NULL); if (ret) goto out; if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret && old_len != new_len) goto out; old_len = new_len; @@ -462,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, + uf_unmap); if (!(offset_in_page(ret))) goto out; out1: @@ -502,6 +505,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap); if 
(flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -528,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf); + &locked, &uf, &uf_unmap); goto out; } @@ -538,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); if (ret && old_len != new_len) goto out; ret = addr; @@ -598,7 +602,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, &uf); + &locked, &uf, &uf_unmap); } out: if (offset_in_page(ret)) { @@ -609,5 +613,6 @@ out: if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index bc964c26be8c..fe9f4fa4a7a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -517,7 +517,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) } /* - * initialise the VMA and region record slabs + * initialise the percpu counter for VM and region record slabs */ void __init mmap_init(void) { @@ -1205,7 +1205,8 @@ unsigned long do_mmap(struct file *file, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, - unsigned long *populate) + unsigned long *populate, + struct list_head *uf) { struct vm_area_struct *vma; struct vm_region *region; @@ -1577,7 +1578,7 @@ static int shrink_vma(struct mm_struct *mm, * - under NOMMU conditions the chunk to be unmapped must be backed by a single * VMA, though it need not cover the whole VMA */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { struct vm_area_struct *vma; unsigned long end; @@ -1643,7 +1644,7 @@ int vm_munmap(unsigned long addr, size_t len) int ret; down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); + ret = do_munmap(mm, addr, len, NULL); up_write(&mm->mmap_sem); return ret; } @@ -1794,7 +1795,7 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8256788ac119..578321f1c070 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -403,12 +403,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - nodemask_t *nm = (oc->nodemask) ? 
oc->nodemask : &cpuset_current_mems_allowed; - - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, - nodemask_pr_args(nm), oc->order, - current->signal->oom_score_adj); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", + current->comm, oc->gfp_mask, &oc->gfp_mask); + if (oc->nodemask) + pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); + else + pr_cont("(null)"); + pr_cont(", order=%d, oom_score_adj=%hd\n", + oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); @@ -417,7 +419,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (oc->memcg) mem_cgroup_print_oom_info(oc->memcg, p); else - show_mem(SHOW_MEM_FILTER_NODES, nm); + show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 216449825859..ae6e601f0a58 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -580,7 +580,7 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, __fprop_inc_percpu_max(&dom->completions, completions, max_prop_frac); /* First event after period switching was turned off? */ - if (!unlikely(dom->period_time)) { + if (unlikely(!dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c21b33668133..9f9623d690d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -59,7 +59,6 @@ #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> -#include <linux/page_ext.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> #include <linux/page_owner.h> @@ -92,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +/* work_structs for global per-cpu drains */ +DEFINE_MUTEX(pcpu_drain_mutex); +DEFINE_PER_CPU(struct work_struct, pcpu_drain); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ -1085,10 +1088,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; + unsigned long nr_scanned, flags; bool isolated_pageblocks; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) @@ -1137,7 +1140,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -1145,8 +1148,9 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; - spin_lock(&zone->lock); + unsigned long nr_scanned, flags; + spin_lock_irqsave(&zone->lock, flags); + __count_vm_events(PGFREE, 1 << order); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); @@ -1156,7 +1160,7 @@ static void free_one_page(struct zone *zone, migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype); - 
spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -1234,7 +1238,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) static void __free_pages_ok(struct page *page, unsigned int order) { - unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); @@ -1242,10 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype); - local_irq_restore(flags); } static void __init __free_pages_boot_core(struct page *page, unsigned int order) @@ -2217,8 +2217,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, int migratetype, bool cold) { int i, alloced = 0; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) @@ -2254,7 +2255,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return alloced; } @@ -2339,16 +2340,26 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } +static void drain_local_pages_wq(struct work_struct *work) +{ + /* + * drain_all_pages doesn't use proper cpu hotplug protection so + * we can race with cpu offline when the WQ can move this from + * a cpu pinned worker to an unbound one. We can operate on a different + * cpu which is allright but we also have to make sure to not move to + * a different one. + */ + preempt_disable(); + drain_local_pages(NULL); + preempt_enable(); +} + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. * - * Note that this code is protected against sending an IPI to an offline - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but - * nothing keeps CPUs from showing up after we populated the cpumask and - * before the call to on_each_cpu_mask(). + * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -2360,6 +2371,21 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* Workqueues cannot recurse */ + if (current->flags & PF_WQ_WORKER) + return; + + /* + * Do not drain if one is already in progress unless it's specific to + * a zone. Such callers are primarily CMA and memory hotplug and need + * the drain to be complete when the call returns. 
+ */ + if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { + if (!zone) + return; + mutex_lock(&pcpu_drain_mutex); + } + /* * We don't care about racing with CPU hotplug event * as offline notification will cause the notified @@ -2390,8 +2416,16 @@ void drain_all_pages(struct zone *zone) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, - zone, 1); + + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); + INIT_WORK(work, drain_local_pages_wq); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, &cpus_with_pcps) + flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + + mutex_unlock(&pcpu_drain_mutex); } #ifdef CONFIG_HIBERNATION @@ -2442,17 +2476,20 @@ void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; + if (in_interrupt()) { + __free_pages_ok(page, 0); + return; + } + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); - __count_vm_event(PGFREE); + preempt_disable(); /* * We only track unmovable, reclaimable and movable on pcp lists. @@ -2469,6 +2506,7 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = MIGRATE_MOVABLE; } + __count_vm_event(PGFREE); pcp = &this_cpu_ptr(zone->pageset)->pcp; if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); @@ -2482,7 +2520,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - local_irq_restore(flags); + preempt_enable(); } /* @@ -2600,74 +2638,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif } +/* Remove page from the per-cpu list, caller must protect the list */ +static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + bool cold, struct per_cpu_pages *pcp, + struct list_head *list) +{ + struct page *page; + + VM_BUG_ON(in_interrupt()); + + do { + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + return NULL; + } + + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + + list_del(&page->lru); + pcp->count--; + } while (check_new_pcp(page)); + + return page; +} + +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + bool cold = ((gfp_flags & __GFP_COLD) != 0); + struct page *page; + + preempt_disable(); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); + } + preempt_enable(); + return page; +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
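A minimal sketch (not from the patch) of the protection rule the free_hot_cold_page() and rmqueue_pcplist() hunks above establish: per-cpu pagesets are now serialised by disabling preemption rather than interrupts, and interrupt context must avoid them entirely and fall back to the buddy path. The helper name is hypothetical.

static void example_pcp_access(struct zone *zone)
{
        struct per_cpu_pages *pcp;

        /* IRQ context may no longer touch the pcp lists; use the buddy path. */
        if (in_interrupt())
                return;

        preempt_disable();
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        /* ... add to or remove from pcp->lists[] and adjust pcp->count ... */
        preempt_enable();
}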
*/ static inline -struct page *buffered_rmqueue(struct zone *preferred_zone, +struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { unsigned long flags; struct page *page; - bool cold = ((gfp_flags & __GFP_COLD) != 0); - if (likely(order == 0)) { - struct per_cpu_pages *pcp; - struct list_head *list; - - local_irq_save(flags); - do { - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } - - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - - list_del(&page->lru); - pcp->count--; + if (likely(order == 0) && !in_interrupt()) { + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype); + goto out; + } - } while (check_new_pcp(page)); - } else { - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + spin_lock_irqsave(&zone->lock, flags); - do { - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); - } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } if (!page) - goto failed; - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - } + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); + spin_unlock(&zone->lock); + if (!page) + goto failed; + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - VM_BUG_ON_PAGE(bad_range(zone, page), page); +out: + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; failed: @@ -2875,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, #ifdef CONFIG_NUMA static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -2972,7 +3041,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } try_this_zone: - page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@ -3825,76 +3894,76 @@ got_pg: return page; } -/* - * This is the 'heart' of the zoned buddy allocator. 
- */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask, + struct alloc_context *ac, gfp_t *alloc_mask, + unsigned int *alloc_flags) { - struct page *page; - unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ - struct alloc_context ac = { - .high_zoneidx = gfp_zone(gfp_mask), - .zonelist = zonelist, - .nodemask = nodemask, - .migratetype = gfpflags_to_migratetype(gfp_mask), - }; + ac->high_zoneidx = gfp_zone(gfp_mask); + ac->zonelist = zonelist; + ac->nodemask = nodemask; + ac->migratetype = gfpflags_to_migratetype(gfp_mask); if (cpusets_enabled()) { - alloc_mask |= __GFP_HARDWALL; - alloc_flags |= ALLOC_CPUSET; - if (!ac.nodemask) - ac.nodemask = &cpuset_current_mems_allowed; + *alloc_mask |= __GFP_HARDWALL; + if (!ac->nodemask) + ac->nodemask = &cpuset_current_mems_allowed; + else + *alloc_flags |= ALLOC_CPUSET; } - gfp_mask &= gfp_allowed_mask; - lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + return false; - /* - * Check the zones suitable for the gfp_mask contain at least one - * valid zone. It's possible to have an empty zonelist as a result - * of __GFP_THISNODE and a memoryless node - */ - if (unlikely(!zonelist->_zonerefs->zone)) - return NULL; + if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) + *alloc_flags |= ALLOC_CMA; - if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; + return true; +} +/* Determine whether to spread dirty pages and what the first usable zone */ +static inline void finalise_ac(gfp_t gfp_mask, + unsigned int order, struct alloc_context *ac) +{ /* Dirty zone balancing only done in the fast path */ - ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* * The preferred zone is used for statistics but crucially it is * also used as the starting point for the zonelist iterator. It * may get reset for allocations that ignore memory policies. */ - ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, - ac.high_zoneidx, ac.nodemask); - if (!ac.preferred_zoneref->zone) { - page = NULL; - /* - * This might be due to race with cpuset_current_mems_allowed - * update, so make sure we retry with original nodemask in the - * slow path. - */ - goto no_zone; - } + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { }; + + gfp_mask &= gfp_allowed_mask; + if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + return NULL; + + finalise_ac(gfp_mask, order, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) goto out; -no_zone: /* * Runtime PM, block IO and its error handling path can deadlock * because I/O on the device might not complete. 
@@ -6420,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) start_pfn = end_pfn; } - arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; - arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); @@ -7157,8 +7224,9 @@ void *__init alloc_large_system_hash(const char *tablename, * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. It means you can't - * expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) @@ -7214,6 +7282,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (skip_hwpoisoned_pages && PageHWPoison(page)) continue; + if (__PageMovable(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -7325,6 +7396,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7338,7 +7410,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7350,7 +7422,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = GFP_KERNEL, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_idle.c b/mm/page_idle.c index ae11aa914e55..b0ee56c56b58 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -54,27 +54,27 @@ static int page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; bool referenced = false; - if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl)) - return SWAP_AGAIN; - - if (pte) { - referenced = ptep_clear_young_notify(vma, addr, pte); - pte_unmap(pte); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - referenced = pmdp_clear_young_notify(vma, addr, pmd); - } else { - /* unexpected pmd-mapped page? */ - WARN_ON_ONCE(1); + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + referenced = ptep_clear_young_notify(vma, addr, + pvmw.pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + referenced = pmdp_clear_young_notify(vma, addr, + pvmw.pmd); + } else { + /* unexpected pmd-mapped page? 
*/ + WARN_ON_ONCE(1); + } } - spin_unlock(ptl); - if (referenced) { clear_page_idle(page); /* diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c new file mode 100644 index 000000000000..a23001a22c15 --- /dev/null +++ b/mm/page_vma_mapped.c @@ -0,0 +1,218 @@ +#include <linux/mm.h> +#include <linux/rmap.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/swapops.h> + +#include "internal.h" + +static inline bool check_pmd(struct page_vma_mapped_walk *pvmw) +{ + pmd_t pmde; + /* + * Make sure we don't re-load pmd between present and !trans_huge check. + * We need a consistent view. + */ + pmde = READ_ONCE(*pvmw->pmd); + return pmd_present(pmde) && !pmd_trans_huge(pmde); +} + +static inline bool not_found(struct page_vma_mapped_walk *pvmw) +{ + page_vma_mapped_walk_done(pvmw); + return false; +} + +static bool map_pte(struct page_vma_mapped_walk *pvmw) +{ + pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address); + if (!(pvmw->flags & PVMW_SYNC)) { + if (pvmw->flags & PVMW_MIGRATION) { + if (!is_swap_pte(*pvmw->pte)) + return false; + } else { + if (!pte_present(*pvmw->pte)) + return false; + } + } + pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd); + spin_lock(pvmw->ptl); + return true; +} + +static bool check_pte(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->flags & PVMW_MIGRATION) { +#ifdef CONFIG_MIGRATION + swp_entry_t entry; + if (!is_swap_pte(*pvmw->pte)) + return false; + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) + return false; + if (migration_entry_to_page(entry) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (migration_entry_to_page(entry) < pvmw->page) + return false; +#else + WARN_ON_ONCE(1); +#endif + } else { + if (!pte_present(*pvmw->pte)) + return false; + + /* THP can be referenced by any subpage */ + if (pte_page(*pvmw->pte) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (pte_page(*pvmw->pte) < pvmw->page) + return false; + } + + return true; +} + +/** + * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * @pvmw->address + * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags + * must be set. pmd, pte and ptl must be NULL. + * + * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point + * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is + * adjusted if needed (for PTE-mapped THPs). + * + * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page + * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in + * a loop to find all PTEs that map the THP. + * + * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry + * regardless of which page table level the page is mapped at. @pvmw->pmd is + * NULL. + * + * Retruns false if there are no more page table entries for the page in + * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. + * + * If you need to stop the walk before page_vma_mapped_walk() returned false, + * use page_vma_mapped_walk_done(). It will do the housekeeping. 
+ */ +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) +{ + struct mm_struct *mm = pvmw->vma->vm_mm; + struct page *page = pvmw->page; + pgd_t *pgd; + pud_t *pud; + + /* The only possible pmd mapping has been handled on last iteration */ + if (pvmw->pmd && !pvmw->pte) + return not_found(pvmw); + + /* Only for THP, seek to next pte entry makes sense */ + if (pvmw->pte) { + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); + goto next_pte; + } + + if (unlikely(PageHuge(pvmw->page))) { + /* when pud is not present, pte will be NULL */ + pvmw->pte = huge_pte_offset(mm, pvmw->address); + if (!pvmw->pte) + return false; + + pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + spin_lock(pvmw->ptl); + if (!check_pte(pvmw)) + return not_found(pvmw); + return true; + } +restart: + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) + return false; + pud = pud_offset(pgd, pvmw->address); + if (!pud_present(*pud)) + return false; + pvmw->pmd = pmd_offset(pud, pvmw->address); + if (pmd_trans_huge(*pvmw->pmd)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + if (!pmd_present(*pvmw->pmd)) + return not_found(pvmw); + if (likely(pmd_trans_huge(*pvmw->pmd))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (pmd_page(*pvmw->pmd) != page) + return not_found(pvmw); + return true; + } else { + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + } else { + if (!check_pmd(pvmw)) + return false; + } + if (!map_pte(pvmw)) + goto next_pte; + while (1) { + if (check_pte(pvmw)) + return true; +next_pte: do { + pvmw->address += PAGE_SIZE; + if (pvmw->address >= + __vma_address(pvmw->page, pvmw->vma) + + hpage_nr_pages(pvmw->page) * PAGE_SIZE) + return not_found(pvmw); + /* Did we cross page table boundary? */ + if (pvmw->address % PMD_SIZE == 0) { + pte_unmap(pvmw->pte); + if (pvmw->ptl) { + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + goto restart; + } else { + pvmw->pte++; + } + } while (pte_none(*pvmw->pte)); + + if (!pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } + } +} + +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. 
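To illustrate the calling convention documented above (the struct fields and helpers are the ones this file introduces; the counting function itself is hypothetical), a caller that wants to visit every place a possibly PTE-mapped THP is mapped in a VMA keeps calling the walker until it returns false:

/* Count how many base pages of @page are mapped into @vma at @address. */
static int example_count_mappings(struct page *page,
                                  struct vm_area_struct *vma,
                                  unsigned long address)
{
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = address,
        };
        int nr = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                if (pvmw.pte)
                        nr++;                           /* one PTE, one subpage */
                else
                        nr += hpage_nr_pages(page);     /* PMD-mapped THP */
                /*
                 * Leaving the loop before it returns false would require
                 * page_vma_mapped_walk_done(&pvmw) to drop the page table lock.
                 */
        }
        return nr;
}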
+ */ +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .flags = PVMW_SYNC, + }; + unsigned long start, end; + + start = __vma_address(page, vma); + end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); + + if (unlikely(end < vma->vm_start || start >= vma->vm_end)) + return 0; + pvmw.address = max(start, vma->vm_start); + if (!page_vma_mapped_walk(&pvmw)) + return 0; + page_vma_mapped_walk_done(&pvmw); + return 1; +} diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 207244489a68..03761577ae86 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -78,14 +78,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_offset(pgd, addr); do { + again: next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { + if (pud_none(*pud) || !walk->vma) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + + if (walk->pud_entry) { + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + err = walk->pud_entry(pud, addr, next, walk); + spin_unlock(ptl); + if (err) + break; + continue; + } + } + + split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 71c5f9109f2a..4ed5908c65b0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -123,6 +123,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + pud_t pud; + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return pud; +} +#endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT diff --git a/mm/rmap.c b/mm/rmap.c index 91619fd70939..8774791e2809 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -607,8 +607,7 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, - struct page *page, bool writable) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; @@ -643,8 +642,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } #else -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, - struct page *page, bool writable) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { } @@ -710,170 +708,6 @@ out: return pmd; } -/* - * Check that @page is mapped at @address into @mm. - * - * If @sync is false, page_check_address may perform a racy check to avoid - * the page table lock when the pte is not present (helpful when reclaiming - * highly shared pages). - * - * On success returns with pte mapped and locked. 
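Relatedly, the mm/pagewalk.c hunk above starts honouring a pud_entry callback for huge PUD entries, invoking it with the huge-PUD page table lock held. A hypothetical walker wiring this up (callback and wrapper names are invented; the mm_walk plumbing is the existing page walk API assumed by that hunk) might look like:

static int example_pud_entry(pud_t *pud, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        /* Reached only for huge PUDs, with the corresponding ptl held. */
        pr_debug("huge pud covering %#lx-%#lx\n", addr, next);
        return 0;
}

static int example_scan(struct mm_struct *mm, unsigned long start,
                        unsigned long end)
{
        struct mm_walk walk = {
                .pud_entry = example_pud_entry,
                .mm = mm,
        };

        /* Caller is assumed to hold mm->mmap_sem at least for read. */
        return walk_page_range(start, end, &walk);
}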
- */ -pte_t *__page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp, int sync) -{ - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return NULL; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - goto check; - } - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return NULL; - - pte = pte_offset_map(pmd, address); - /* Make a quick check before getting the lock */ - if (!sync && !pte_present(*pte)) { - pte_unmap(pte); - return NULL; - } - - ptl = pte_lockptr(mm, pmd); -check: - spin_lock(ptl); - if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { - *ptlp = ptl; - return pte; - } - pte_unmap_unlock(pte, ptl); - return NULL; -} - -/** - * page_mapped_in_vma - check whether a page is really mapped in a VMA - * @page: the page to test - * @vma: the VMA to test - * - * Returns 1 if the page is mapped into the page tables of the VMA, 0 - * if the page is not mapped into the page tables of this VMA. Only - * valid for normal file or anonymous VMAs. - */ -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) -{ - unsigned long address; - pte_t *pte; - spinlock_t *ptl; - - address = __vma_address(page, vma); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - return 0; - pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); - if (!pte) /* the page is not in this mm */ - return 0; - pte_unmap_unlock(pte, ptl); - - return 1; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * Check that @page is mapped at @address into @mm. In contrast to - * page_check_address(), this function can handle transparent huge pages. - * - * On success returns true with pte mapped and locked. For PMD-mapped - * transparent huge pages *@ptep is set to NULL. 
- */ -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return false; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - pmd = NULL; - goto check_pte; - } - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return false; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return false; - pmd = pmd_offset(pud, address); - - if (pmd_trans_huge(*pmd)) { - ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock_pmd; - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - goto map_pte; - } - - if (pmd_page(*pmd) != page) - goto unlock_pmd; - - pte = NULL; - goto found; -unlock_pmd: - spin_unlock(ptl); - return false; - } else { - pmd_t pmde = *pmd; - - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - return false; - } -map_pte: - pte = pte_offset_map(pmd, address); - if (!pte_present(*pte)) { - pte_unmap(pte); - return false; - } - - ptl = pte_lockptr(mm, pmd); -check_pte: - spin_lock(ptl); - - if (!pte_present(*pte)) { - pte_unmap_unlock(pte, ptl); - return false; - } - - /* THP can be referenced by any subpage */ - if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { - pte_unmap_unlock(pte, ptl); - return false; - } -found: - *ptep = pte; - *pmdp = pmd; - *ptlp = ptl; - return true; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - struct page_referenced_arg { int mapcount; int referenced; @@ -886,45 +720,48 @@ struct page_referenced_arg { static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct mm_struct *mm = vma->vm_mm; struct page_referenced_arg *pra = arg; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; int referenced = 0; - if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl)) - return SWAP_AGAIN; + while (page_vma_mapped_walk(&pvmw)) { + address = pvmw.address; - if (vma->vm_flags & VM_LOCKED) { - if (pte) - pte_unmap(pte); - spin_unlock(ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + if (vma->vm_flags & VM_LOCKED) { + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ + } - if (pte) { - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!(vma->vm_flags & VM_SEQ_READ))) + if (pvmw.pte) { + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* + * Don't treat a reference through + * a sequentially read mapping as such. + * If the page has been used in another mapping, + * we will catch it; if this other mapping is + * already gone, the unmap path will have set + * PG_referenced or activated the page. + */ + if (likely(!(vma->vm_flags & VM_SEQ_READ))) + referenced++; + } + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, + pvmw.pmd)) referenced++; + } else { + /* unexpected pmd-mapped page? 
*/ + WARN_ON_ONCE(1); } - pte_unmap(pte); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (pmdp_clear_flush_young_notify(vma, address, pmd)) - referenced++; - } else { - /* unexpected pmd-mapped page? */ - WARN_ON_ONCE(1); + + pra->mapcount--; } - spin_unlock(ptl); if (referenced) clear_page_idle(page); @@ -936,7 +773,6 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pra->vm_flags |= vma->vm_flags; } - pra->mapcount--; if (!pra->mapcount) return SWAP_SUCCESS; /* To break the loop */ @@ -1015,34 +851,56 @@ int page_referenced(struct page *page, static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; - int ret = 0; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + .flags = PVMW_SYNC, + }; int *cleaned = arg; - pte = page_check_address(page, mm, address, &ptl, 1); - if (!pte) - goto out; - - if (pte_dirty(*pte) || pte_write(*pte)) { - pte_t entry; + while (page_vma_mapped_walk(&pvmw)) { + int ret = 0; + address = pvmw.address; + if (pvmw.pte) { + pte_t entry; + pte_t *pte = pvmw.pte; + + if (!pte_dirty(*pte) && !pte_write(*pte)) + continue; + + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); + entry = pte_wrprotect(entry); + entry = pte_mkclean(entry); + set_pte_at(vma->vm_mm, address, pte, entry); + ret = 1; + } else { +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + pmd_t *pmd = pvmw.pmd; + pmd_t entry; + + if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) + continue; + + flush_cache_page(vma, address, page_to_pfn(page)); + entry = pmdp_huge_clear_flush(vma, address, pmd); + entry = pmd_wrprotect(entry); + entry = pmd_mkclean(entry); + set_pmd_at(vma->vm_mm, address, pmd, entry); + ret = 1; +#else + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); +#endif + } - flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); - entry = pte_wrprotect(entry); - entry = pte_mkclean(entry); - set_pte_at(mm, address, pte, entry); - ret = 1; + if (ret) { + mmu_notifier_invalidate_page(vma->vm_mm, address); + (*cleaned)++; + } } - pte_unmap_unlock(pte, ptl); - - if (ret) { - mmu_notifier_invalidate_page(mm, address); - (*cleaned)++; - } -out: return SWAP_AGAIN; } @@ -1435,155 +1293,163 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; pte_t pteval; - spinlock_t *ptl; + struct page *subpage; int ret = SWAP_AGAIN; struct rmap_private *rp = arg; enum ttu_flags flags = rp->flags; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - goto out; + return SWAP_AGAIN; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, flags & TTU_MIGRATION, page); - /* check if we have anything to do after split */ - if (page_mapcount(page) == 0) - goto out; } - pte = page_check_address(page, mm, address, &ptl, - PageTransCompound(page)); - if (!pte) - goto out; + while (page_vma_mapped_walk(&pvmw)) { + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; - /* - * If the page is mlock()d, we cannot swap it out. - * If it's recently referenced (perhaps page_referenced - * skipped over this mm) then we should reactivate it. 
- */ - if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - /* PTE-mapped THP are never mlocked */ - if (!PageTransCompound(page)) { - /* - * Holding pte lock, we do *not* need - * mmap_sem here - */ - mlock_vma_page(page); - } - ret = SWAP_MLOCK; - goto out_unmap; - } - if (flags & TTU_MUNLOCK) - goto out_unmap; - } - if (!(flags & TTU_IGNORE_ACCESS)) { - if (ptep_clear_flush_young_notify(vma, address, pte)) { - ret = SWAP_FAIL; - goto out_unmap; - } - } + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); - /* Nuke the page table entry. */ - flush_cache_page(vma, address, page_to_pfn(page)); - if (should_defer_flush(mm, flags)) { /* - * We clear the PTE but do not flush so potentially a remote - * CPU could still be writing to the page. If the entry was - * previously clean then the architecture must guarantee that - * a clear->dirty transition on a cached TLB entry is written - * through and traps if the PTE is unmapped. + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. */ - pteval = ptep_get_and_clear(mm, address, pte); - - set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); - } else { - pteval = ptep_clear_flush(vma, address, pte); - } + if (!(flags & TTU_IGNORE_MLOCK)) { + if (vma->vm_flags & VM_LOCKED) { + /* PTE-mapped THP are never mlocked */ + if (!PageTransCompound(page)) { + /* + * Holding pte lock, we do *not* need + * mmap_sem here + */ + mlock_vma_page(page); + } + ret = SWAP_MLOCK; + page_vma_mapped_walk_done(&pvmw); + break; + } + if (flags & TTU_MUNLOCK) + continue; + } - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); + if (!(flags & TTU_IGNORE_ACCESS)) { + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + ret = SWAP_FAIL; + page_vma_mapped_walk_done(&pvmw); + break; + } + } - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the page. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pvmw.pte); - if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (PageHuge(page)) { - hugetlb_count_sub(1 << compound_order(page), mm); + set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); } else { - dec_mm_counter(mm, mm_counter(page)); + pteval = ptep_clear_flush(vma, address, pvmw.pte); } - set_pte_at(mm, address, pte, - swp_entry_to_pte(make_hwpoison_entry(page))); - } else if (pte_unused(pteval)) { - /* - * The guest indicated that the page content is of no - * interest anymore. Simply discard the pte, vmscan - * will take care of the rest. - */ - dec_mm_counter(mm, mm_counter(page)); - } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { - swp_entry_t entry; - pte_t swp_pte; - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. 
- */ - entry = make_migration_entry(page, pte_write(pteval)); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(mm, address, pte, swp_pte); - } else if (PageAnon(page)) { - swp_entry_t entry = { .val = page_private(page) }; - pte_t swp_pte; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (!PageDirty(page) && (flags & TTU_LZFREE)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; - } + /* Move the dirty bit to the page. Now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); - if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pte, pteval); - ret = SWAP_FAIL; - goto out_unmap; - } - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, MM_ANONPAGES); - inc_mm_counter(mm, MM_SWAPENTS); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(mm, address, pte, swp_pte); - } else - dec_mm_counter(mm, mm_counter_file(page)); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); -discard: - page_remove_rmap(page, PageHuge(page)); - put_page(page); + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageHuge(page)) { + int nr = 1 << compound_order(page); + hugetlb_count_sub(nr, mm); + } else { + dec_mm_counter(mm, mm_counter(page)); + } + + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); + set_pte_at(mm, address, pvmw.pte, pteval); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + dec_mm_counter(mm, mm_counter(page)); + } else if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION)) { + swp_entry_t entry; + pte_t swp_pte; + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(subpage, + pte_write(pteval)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); + } else if (PageAnon(page)) { + swp_entry_t entry = { .val = page_private(subpage) }; + pte_t swp_pte; + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... 
+ */ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + + if (!PageDirty(page) && (flags & TTU_LZFREE)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + rp->lazyfreed++; + goto discard; + } -out_unmap: - pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pvmw.pte, pteval); + ret = SWAP_FAIL; + page_vma_mapped_walk_done(&pvmw); + break; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); + } else + dec_mm_counter(mm, mm_counter_file(page)); +discard: + page_remove_rmap(subpage, PageHuge(page)); + put_page(page); mmu_notifier_invalidate_page(mm, address); -out: + } return ret; } @@ -1608,7 +1474,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) static int page_mapcount_is_zero(struct page *page) { - return !page_mapcount(page); + return !total_mapcount(page); } /** @@ -1755,7 +1621,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; - pgoff_t pgoff; + pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1769,8 +1635,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!anon_vma) return ret; - pgoff = page_to_pgoff(page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { + pgoff_start = page_to_pgoff(page); + pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1808,7 +1676,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); - pgoff_t pgoff; + pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1823,10 +1691,12 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (!mapping) return ret; - pgoff = page_to_pgoff(page); + pgoff_start = page_to_pgoff(page); + pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; if (!locked) i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, + pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); cond_resched(); diff --git a/mm/shmem.c b/mm/shmem.c index 9c6d22ff44e2..a26649a6633f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1908,8 +1908,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync return ret; } -static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shmem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; @@ -2330,7 +2331,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals)) { + if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { if 
(info->seals & F_SEAL_WRITE) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) diff --git a/mm/slab_common.c b/mm/slab_common.c index 23ff74e61838..09d0e849b07f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -528,6 +528,9 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) static int shutdown_cache(struct kmem_cache *s) { + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; @@ -816,7 +819,6 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); - kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; diff --git a/mm/swap.c b/mm/swap.c index aabf2e90fe32..c4910f14f957 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,9 +209,10 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, { int *pgmoved = arg; - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &lruvec->lists[lru]); + if (PageLRU(page) && !PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec, page_lru(page)); + ClearPageActive(page); + add_page_to_lru_list_tail(page, lruvec, page_lru(page)); (*pgmoved)++; } } @@ -235,7 +236,7 @@ static void pagevec_move_tail(struct pagevec *pvec) */ void rotate_reclaimable_page(struct page *page) { - if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + if (!PageLocked(page) && !PageDirty(page) && !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; diff --git a/mm/truncate.c b/mm/truncate.c index dd7b24e083c5..f2db67465495 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include <linux/shmem_fs.h> #include <linux/cleancache.h> #include <linux/rmap.h> #include "internal.h" diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 1e5c2f94e8a3..9f0ad2a4f102 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -197,22 +197,25 @@ retry: * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { - err = -EINVAL; + err = -ENOENT; dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; - - if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) - goto out_unlock; - /* - * Make sure the remaining dst range is both valid and - * fully within a single existing vma. + * Only allow __mcopy_atomic_hugetlb on userfaultfd + * registered ranges. */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; + err = -EINVAL; + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) + goto out_unlock; + vm_shared = dst_vma->vm_flags & VM_SHARED; } @@ -221,12 +224,6 @@ retry: goto out_unlock; /* - * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges. - */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out_unlock; - - /* * If not shared, ensure the dst_vma has a anon_vma. */ err = -ENOMEM; @@ -404,22 +401,35 @@ retry: * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ - err = -EINVAL; + err = -ENOENT; dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma) goto out_unlock; /* - * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but - * it will overwrite vm_ops, so vma_is_anonymous must return false. 
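As seen in the special_mapping_fault(), filemap_fault() and shmem_fault() hunks in this series, ->fault handlers now receive only a struct vm_fault and pick the VMA out of vmf->vma. A minimal, hypothetical handler under the new prototype (names invented, real page handling elided):

static int example_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma; /* no separate vma argument */

        pr_debug("fault at %#lx in vma %p\n", vmf->address, vma);
        return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct example_vm_ops = {
        .fault = example_fault,
};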
+ * Be strict and only allow __mcopy_atomic on userfaultfd + * registered ranges to prevent userland errors going + * unnoticed. As far as the VM consistency is concerned, it + * would be perfectly safe to remove this check, but there's + * no useful usage for __mcopy_atomic ouside of userfaultfd + * registered ranges. This is after all why these are ioctls + * belonging to the userfaultfd and not syscalls. */ - if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && - dst_vma->vm_flags & VM_SHARED)) + if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; + err = -EINVAL; + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + /* * If this is a HUGETLB vma, pass off to appropriate routine */ @@ -427,18 +437,6 @@ retry: return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, src_start, len, zeropage); - /* - * Be strict and only allow __mcopy_atomic on userfaultfd - * registered ranges to prevent userland errors going - * unnoticed. As far as the VM consistency is concerned, it - * would be perfectly safe to remove this check, but there's - * no useful usage for __mcopy_atomic ouside of userfaultfd - * registered ranges. This is after all why these are ioctls - * belonging to the userfaultfd and not syscalls. - */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out_unlock; - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; diff --git a/mm/util.c b/mm/util.c index 3cb2164f4099..b8f538863b5a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -11,6 +11,7 @@ #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> +#include <linux/userfaultfd_k.h> #include <asm/sections.h> #include <linux/uaccess.h> @@ -297,14 +298,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; + LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (down_write_killable(&mm->mmap_sem)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate); + &populate, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d89034a393f2..be93949b4885 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1642,6 +1642,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; + if (fatal_signal_pending(current)) { + area->nr_pages = i; + goto fail; + } + if (node == NUMA_NO_NODE) page = alloc_page(alloc_mask); else @@ -2654,7 +2659,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) - seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); + seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 149fdf6c5c56..6063581f705c 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -112,9 +112,16 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; - unsigned long pressure; + unsigned long pressure = 0; /* + * reclaimed can be greater than scanned in cases + * like 
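The mm/util.c hunk above threads a local list through do_mmap_pgoff() so userfaultfd can be told about ranges torn down by the new mapping only after mmap_sem has been released. A trimmed sketch of that calling pattern (error handling dropped for brevity):

    LIST_HEAD(uf);      /* unmap events collected while mmap_sem is held */

    if (down_write_killable(&mm->mmap_sem))
        return -EINTR;
    ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate, &uf);
    up_write(&mm->mmap_sem);

    /* Deliver the unmap notifications to userfaultfd with the lock dropped. */
    userfaultfd_unmap_complete(mm, &uf);
    if (populate)
        mm_populate(ret, populate);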
THP, where the scanned is 1 and reclaimed + * could be 512 + */ + if (reclaimed >= scanned) + goto out; + /* * We calculate the ratio (in percents) of how many pages were * scanned vs. reclaimed in a given time frame (window). Note that * time is in VM reclaimer's "ticks", i.e. number of pages @@ -124,6 +131,7 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, pressure = scale - (reclaimed * scale / scanned); pressure = pressure * 100 / scale; +out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7bb23ff229b6..70aa739c6b68 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -87,6 +87,7 @@ struct scan_control { /* The highest zone to isolate pages for reclaim from */ enum zone_type reclaim_idx; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -1055,6 +1056,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. + * + * In cases 1) and 2) we activate the pages to get them out of + * the way while we continue scanning for clean pages on the + * inactive list and refilling from the active list. The + * observation here is that waiting for disk writes is more + * expensive than potentially causing reloads down the line. + * Since they're marked for immediate reclaim, they won't put + * memory pressure on the cache working set any longer than it + * takes to write them to disk. */ if (PageWriteback(page)) { /* Case 1 above */ @@ -1062,7 +1072,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; - goto keep_locked; + goto activate_locked; /* Case 2 above */ } else if (sane_reclaim(sc) || @@ -1080,7 +1090,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; + goto activate_locked; /* Case 3 above */ } else { @@ -1152,13 +1162,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { /* - * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but only writeback - * if many dirty pages have been encountered. + * Only kswapd can writeback filesystem pages + * to avoid risk of stack overflow. But avoid + * injecting inefficient single-page IO into + * flusher writeback as much as possible: only + * write pages when we've encountered many + * dirty pages, and when we've already scanned + * the rest of the LRU for clean pages and see + * the same dirty pages again (PageReclaim). */ if (page_is_file_cache(page) && - (!current_is_kswapd() || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + (!current_is_kswapd() || !PageReclaim(page) || + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1168,7 +1183,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); - goto keep_locked; + goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) @@ -1373,13 +1388,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * wants to isolate pages it will be able to operate on without * blocking - clean pages for the most part. 
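The vmpressure_calc_level() guard above exists because the ratio is computed with unsigned arithmetic: when a single scanned THP frees 512 pages, scale - (reclaimed * scale / scanned) would wrap around. A stand-alone reproduction of the computation:

#include <stdio.h>

static unsigned long vmpressure_percent(unsigned long scanned,
                                        unsigned long reclaimed)
{
    unsigned long scale = scanned + reclaimed;
    unsigned long pressure = 0;

    /* Same guard as the hunk: treat "reclaimed >= scanned" as zero pressure. */
    if (reclaimed >= scanned)
        return 0;

    pressure = scale - (reclaimed * scale / scanned);
    return pressure * 100 / scale;
}

int main(void)
{
    printf("%lu%%\n", vmpressure_percent(1000, 100));  /* prints 90% */
    printf("%lu%%\n", vmpressure_percent(1, 512));     /* guarded: 0% */
    return 0;
}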
* - * ISOLATE_CLEAN means that only clean pages should be isolated. This - * is used by reclaim when it is cannot write to backing storage - * * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages * that it is possible to migrate without blocking */ - if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) return ret; @@ -1387,10 +1399,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; - /* ISOLATE_CLEAN means only clean pages */ - if (mode & ISOLATE_CLEAN) - return ret; - /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate @@ -1731,8 +1739,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -1806,12 +1812,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, /* * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not keeping up. In this case, flag - * the pgdat PGDAT_DIRTY and kswapd will start writing pages from - * reclaim context. + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep, but + * also allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(0, WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); + } /* * If kswapd scans pages marked marked for immediate @@ -1929,8 +1943,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -2759,8 +2771,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; - unsigned long total_scanned = 0; - unsigned long writeback_threshold; retry: delayacct_freepages_start(); @@ -2773,7 +2783,6 @@ retry: sc->nr_scanned = 0; shrink_zones(zonelist, sc); - total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; @@ -2786,20 +2795,6 @@ retry: */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; - - /* - * Try to write back as many pages as we just scanned. This - * tends to cause slow streaming writers to write data to the - * disk smoothly, at the dirtying rate, which is nice. But - * that's undesirable in laptop mode, where we *want* lumpy - * writeout. So in laptop mode, write out the whole world. - */ - writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; - if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 
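The shrink_inactive_list() change above replaces the old scan-count writeback heuristic (removed around this point in do_try_to_free_pages()) with a targeted nudge: if every page taken off the LRU tail was dirty but not yet queued for IO, the flushers are lagging. A condensed view of the new branch:

    if (stat.nr_unqueued_dirty == nr_taken) {
        /* All isolated pages were dirty and unqueued: wake the flushers... */
        wakeup_flusher_threads(0, WB_REASON_VMSCAN);
        /* ...and allow kswapd to write pages from reclaim context. */
        set_bit(PGDAT_DIRTY, &pgdat->flags);
    }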
0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); - sc->may_writepage = 1; - } } while (--sc->priority >= 0); delayacct_freepages_end(); @@ -3101,6 +3096,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); return true; } diff --git a/mm/workingset.c b/mm/workingset.c index a67f5796b995..79ed5364375d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -6,6 +6,7 @@ #include <linux/memcontrol.h> #include <linux/writeback.h> +#include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> diff --git a/mm/z3fold.c b/mm/z3fold.c index 207e5ddc87a2..8970a2fd3b1a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,29 +34,62 @@ /***************** * Structures *****************/ +struct z3fold_pool; +struct z3fold_ops { + int (*evict)(struct z3fold_pool *pool, unsigned long handle); +}; + +enum buddy { + HEADLESS = 0, + FIRST, + MIDDLE, + LAST, + BUDDIES_MAX +}; + +/* + * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * z3fold page, except for HEADLESS pages + * @buddy: links the z3fold page into the relevant list in the pool + * @page_lock: per-page lock + * @refcount: reference cound for the z3fold page + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @middle_chunks: the size of the middle buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + * @first_num: the starting number (for the first handle) + */ +struct z3fold_header { + struct list_head buddy; + spinlock_t page_lock; + struct kref refcount; + unsigned short first_chunks; + unsigned short middle_chunks; + unsigned short last_chunks; + unsigned short start_middle; + unsigned short first_num:2; +}; + /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively * adjusting internal fragmentation. It also determines the number of * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk - * in allocated page is occupied by z3fold header, NCHUNKS will be calculated - * to 63 which shows the max number of free chunks in z3fold page, also there - * will be 63 freelists per pool. + * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks + * in the beginning of an allocated page are occupied by z3fold header, so + * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), + * which shows the max number of free chunks in z3fold page, also there will + * be 63, or 62, respectively, freelists per pool. 
*/ #define NCHUNKS_ORDER 6 #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) +#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) +#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) #define BUDDY_MASK (0x3) -struct z3fold_pool; -struct z3fold_ops { - int (*evict)(struct z3fold_pool *pool, unsigned long handle); -}; - /** * struct z3fold_pool - stores metadata for each z3fold pool * @lock: protects all pool fields and first|last_chunk fields of any @@ -64,8 +97,6 @@ struct z3fold_ops { * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; * the lists each z3fold page is added to depends on the size of * its free region. - * @buddied: list tracking the z3fold pages that contain 3 buddies; - * these z3fold pages are full * @lru: list tracking the z3fold pages in LRU order by most recently * added buddy. * @pages_nr: number of z3fold pages in the pool. @@ -78,49 +109,22 @@ struct z3fold_ops { struct z3fold_pool { spinlock_t lock; struct list_head unbuddied[NCHUNKS]; - struct list_head buddied; struct list_head lru; - u64 pages_nr; + atomic64_t pages_nr; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; }; -enum buddy { - HEADLESS = 0, - FIRST, - MIDDLE, - LAST, - BUDDIES_MAX -}; - -/* - * struct z3fold_header - z3fold page metadata occupying the first chunk of each - * z3fold page, except for HEADLESS pages - * @buddy: links the z3fold page into the relevant list in the pool - * @first_chunks: the size of the first buddy in chunks, 0 if free - * @middle_chunks: the size of the middle buddy in chunks, 0 if free - * @last_chunks: the size of the last buddy in chunks, 0 if free - * @first_num: the starting number (for the first handle) - */ -struct z3fold_header { - struct list_head buddy; - unsigned short first_chunks; - unsigned short middle_chunks; - unsigned short last_chunks; - unsigned short start_middle; - unsigned short first_num:2; -}; - /* * Internal z3fold page flags */ enum z3fold_page_flags { - UNDER_RECLAIM = 0, - PAGE_HEADLESS, + PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, }; + /***************** * Helpers *****************/ @@ -140,10 +144,11 @@ static struct z3fold_header *init_z3fold_page(struct page *page) struct z3fold_header *zhdr = page_address(page); INIT_LIST_HEAD(&page->lru); - clear_bit(UNDER_RECLAIM, &page->private); clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + spin_lock_init(&zhdr->page_lock); + kref_init(&zhdr->refcount); zhdr->first_chunks = 0; zhdr->middle_chunks = 0; zhdr->last_chunks = 0; @@ -154,9 +159,36 @@ static struct z3fold_header *init_z3fold_page(struct page *page) } /* Resets the struct page fields and frees the page */ -static void free_z3fold_page(struct z3fold_header *zhdr) +static void free_z3fold_page(struct page *page) +{ + __free_page(page); +} + +static void release_z3fold_page(struct kref *ref) +{ + struct z3fold_header *zhdr; + struct page *page; + + zhdr = container_of(ref, struct z3fold_header, refcount); + page = virt_to_page(zhdr); + + if (!list_empty(&zhdr->buddy)) + list_del(&zhdr->buddy); + if (!list_empty(&page->lru)) + list_del(&page->lru); + free_z3fold_page(page); +} + +/* Lock a z3fold page */ +static inline void z3fold_page_lock(struct z3fold_header *zhdr) +{ + spin_lock(&zhdr->page_lock); +} + +/* Unlock a z3fold page 
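The new ZHDR_CHUNKS/TOTAL_CHUNKS macros above make the header cost explicit: the header is rounded up to whole chunks, so a larger z3fold_header (it now carries a spinlock and a kref, and spinlock_t grows under CONFIG_DEBUG_SPINLOCK) costs usable chunks. A user-space model of that arithmetic; the two header sizes are illustrative guesses, not the real kernel sizes:

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define NCHUNKS_ORDER  6
#define CHUNK_SIZE     (PAGE_SIZE >> NCHUNKS_ORDER)   /* 64 bytes */
#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

static void report(const char *cfg, unsigned long hdr_size)
{
    unsigned long zhdr_aligned = ROUND_UP(hdr_size, CHUNK_SIZE);
    unsigned long nchunks = (PAGE_SIZE - zhdr_aligned) / CHUNK_SIZE;

    printf("%s: header %lu rounds to %lu bytes, %lu usable chunks\n",
           cfg, hdr_size, zhdr_aligned, nchunks);
}

int main(void)
{
    report("plain spinlock (guessed 48-byte header)", 48);   /* 63 chunks */
    report("debug spinlock (guessed 80-byte header)", 80);   /* 62 chunks */
    return 0;
}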
*/ +static inline void z3fold_page_unlock(struct z3fold_header *zhdr) { - __free_page(virt_to_page(zhdr)); + spin_unlock(&zhdr->page_lock); } /* @@ -204,9 +236,10 @@ static int num_free_chunks(struct z3fold_header *zhdr) */ if (zhdr->middle_chunks != 0) { int nfree_before = zhdr->first_chunks ? - 0 : zhdr->start_middle - 1; + 0 : zhdr->start_middle - ZHDR_CHUNKS; int nfree_after = zhdr->last_chunks ? - 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks; + 0 : TOTAL_CHUNKS - + (zhdr->start_middle + zhdr->middle_chunks); nfree = max(nfree_before, nfree_after); } else nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; @@ -236,9 +269,8 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, spin_lock_init(&pool->lock); for_each_unbuddied_list(i, 0) INIT_LIST_HEAD(&pool->unbuddied[i]); - INIT_LIST_HEAD(&pool->buddied); INIT_LIST_HEAD(&pool->lru); - pool->pages_nr = 0; + atomic64_set(&pool->pages_nr, 0); pool->ops = ops; return pool; } @@ -254,25 +286,58 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) kfree(pool); } +static inline void *mchunk_memmove(struct z3fold_header *zhdr, + unsigned short dst_chunk) +{ + void *beg = zhdr; + return memmove(beg + (dst_chunk << CHUNK_SHIFT), + beg + (zhdr->start_middle << CHUNK_SHIFT), + zhdr->middle_chunks << CHUNK_SHIFT); +} + +#define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); - void *beg = zhdr; + if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) + return 0; /* can't move middle chunk, it's used */ + + if (zhdr->middle_chunks == 0) + return 0; /* nothing to compact */ - if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) && - zhdr->middle_chunks != 0 && - zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { - memmove(beg + ZHDR_SIZE_ALIGNED, - beg + (zhdr->start_middle << CHUNK_SHIFT), - zhdr->middle_chunks << CHUNK_SHIFT); + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* move to the beginning */ + mchunk_memmove(zhdr, ZHDR_CHUNKS); zhdr->first_chunks = zhdr->middle_chunks; zhdr->middle_chunks = 0; zhdr->start_middle = 0; zhdr->first_num++; return 1; } + + /* + * moving data is expensive, so let's only do that if + * there's substantial gain (at least BIG_CHUNK_GAP chunks) + */ + if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && + zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= + BIG_CHUNK_GAP) { + mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); + zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; + return 1; + } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && + TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle + + zhdr->middle_chunks) >= + BIG_CHUNK_GAP) { + unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - + zhdr->middle_chunks; + mchunk_memmove(zhdr, new_start); + zhdr->start_middle = new_start; + return 1; + } + return 0; } @@ -313,50 +378,63 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, bud = HEADLESS; else { chunks = size_to_chunks(size); - spin_lock(&pool->lock); /* First, try to find an unbuddied z3fold page. 
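z3fold_compact_page() above only moves the middle buddy when doing so recovers a worthwhile amount of space. A stand-alone sketch of that decision for the "first buddy in use, last buddy free" case (the chunk counts passed in are made-up examples):

#include <stdio.h>
#include <stdbool.h>

#define ZHDR_CHUNKS    1   /* header size in chunks (assumed) */
#define BIG_CHUNK_GAP  3   /* minimum gain that justifies a memmove */

/* Worth compacting only if the gap between first and middle buddies is large. */
static bool worth_moving_middle(unsigned short first_chunks,
                                unsigned short start_middle)
{
    return start_middle - (first_chunks + ZHDR_CHUNKS) >= BIG_CHUNK_GAP;
}

int main(void)
{
    printf("%d\n", worth_moving_middle(10, 12));   /* gap of 1 chunk: leave it */
    printf("%d\n", worth_moving_middle(10, 20));   /* gap of 9 chunks: compact */
    return 0;
}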
*/ zhdr = NULL; for_each_unbuddied_list(i, chunks) { - if (!list_empty(&pool->unbuddied[i])) { - zhdr = list_first_entry(&pool->unbuddied[i], + spin_lock(&pool->lock); + zhdr = list_first_entry_or_null(&pool->unbuddied[i], struct z3fold_header, buddy); - page = virt_to_page(zhdr); - if (zhdr->first_chunks == 0) { - if (zhdr->middle_chunks != 0 && - chunks >= zhdr->start_middle) - bud = LAST; - else - bud = FIRST; - } else if (zhdr->last_chunks == 0) + if (!zhdr) { + spin_unlock(&pool->lock); + continue; + } + kref_get(&zhdr->refcount); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + z3fold_page_lock(zhdr); + if (zhdr->first_chunks == 0) { + if (zhdr->middle_chunks != 0 && + chunks >= zhdr->start_middle) bud = LAST; - else if (zhdr->middle_chunks == 0) - bud = MIDDLE; - else { - pr_err("No free chunks in unbuddied\n"); - WARN_ON(1); - continue; - } - list_del(&zhdr->buddy); - goto found; + else + bud = FIRST; + } else if (zhdr->last_chunks == 0) + bud = LAST; + else if (zhdr->middle_chunks == 0) + bud = MIDDLE; + else { + z3fold_page_unlock(zhdr); + spin_lock(&pool->lock); + if (kref_put(&zhdr->refcount, + release_z3fold_page)) + atomic64_dec(&pool->pages_nr); + spin_unlock(&pool->lock); + pr_err("No free chunks in unbuddied\n"); + WARN_ON(1); + continue; } + goto found; } bud = FIRST; - spin_unlock(&pool->lock); } /* Couldn't find unbuddied z3fold page, create new one */ page = alloc_page(gfp); if (!page) return -ENOMEM; - spin_lock(&pool->lock); - pool->pages_nr++; + + atomic64_inc(&pool->pages_nr); zhdr = init_z3fold_page(page); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); + spin_lock(&pool->lock); goto headless; } + z3fold_page_lock(zhdr); found: if (bud == FIRST) @@ -365,17 +443,15 @@ found: zhdr->last_chunks = chunks; else { zhdr->middle_chunks = chunks; - zhdr->start_middle = zhdr->first_chunks + 1; + zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } + spin_lock(&pool->lock); if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { /* Add to unbuddied list */ freechunks = num_free_chunks(zhdr); list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - } else { - /* Add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); } headless: @@ -387,6 +463,8 @@ headless: *handle = encode_handle(zhdr, bud); spin_unlock(&pool->lock); + if (bud != HEADLESS) + z3fold_page_unlock(zhdr); return 0; } @@ -408,7 +486,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) struct page *page; enum buddy bud; - spin_lock(&pool->lock); zhdr = handle_to_z3fold_header(handle); page = virt_to_page(zhdr); @@ -416,6 +493,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) /* HEADLESS page stored */ bud = HEADLESS; } else { + z3fold_page_lock(zhdr); bud = handle_to_buddy(handle); switch (bud) { @@ -432,38 +510,36 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) default: pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); - spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); return; } } - if (test_bit(UNDER_RECLAIM, &page->private)) { - /* z3fold page is under reclaim, reclaim will free */ - spin_unlock(&pool->lock); - return; - } - - if (bud != HEADLESS) { - /* Remove from existing buddy list */ - list_del(&zhdr->buddy); - } - - if (bud == HEADLESS || - (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 && - zhdr->last_chunks == 0)) { - /* z3fold page is empty, free */ + if (bud == HEADLESS) { + spin_lock(&pool->lock); 
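The reworked z3fold_alloc() above establishes the locking scheme used through the rest of the file: pool->lock only guards the lists, a page is pinned with kref_get() and taken off its list before the pool lock is dropped, and the per-page page_lock is then taken to examine or modify the layout. A condensed sketch of that sequence:

    spin_lock(&pool->lock);
    zhdr = list_first_entry_or_null(&pool->unbuddied[i],
                                    struct z3fold_header, buddy);
    if (zhdr) {
        kref_get(&zhdr->refcount);      /* pin the page... */
        list_del_init(&zhdr->buddy);    /* ...and unhook it from the pool */
    }
    spin_unlock(&pool->lock);           /* pool lock is never held across... */

    if (zhdr)
        z3fold_page_lock(zhdr);         /* ...the per-page lock */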
list_del(&page->lru); - clear_bit(PAGE_HEADLESS, &page->private); - free_z3fold_page(zhdr); - pool->pages_nr--; + spin_unlock(&pool->lock); + free_z3fold_page(page); + atomic64_dec(&pool->pages_nr); } else { - z3fold_compact_page(zhdr); - /* Add to the unbuddied list */ - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || + zhdr->last_chunks != 0) { + z3fold_compact_page(zhdr); + /* Add to the unbuddied list */ + spin_lock(&pool->lock); + if (!list_empty(&zhdr->buddy)) + list_del(&zhdr->buddy); + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_unlock(&pool->lock); + } + z3fold_page_unlock(zhdr); + spin_lock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page)) + atomic64_dec(&pool->pages_nr); + spin_unlock(&pool->lock); } - spin_unlock(&pool->lock); } /** @@ -510,20 +586,25 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || - retries == 0) { + if (!pool->ops || !pool->ops->evict || retries == 0) { spin_unlock(&pool->lock); return -EINVAL; } for (i = 0; i < retries; i++) { + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } page = list_last_entry(&pool->lru, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); - /* Protect z3fold page against free */ - set_bit(UNDER_RECLAIM, &page->private); zhdr = page_address(page); if (!test_bit(PAGE_HEADLESS, &page->private)) { - list_del(&zhdr->buddy); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + kref_get(&zhdr->refcount); + spin_unlock(&pool->lock); + z3fold_page_lock(zhdr); /* * We need encode the handles before unlocking, since * we can race with free that will set @@ -538,13 +619,13 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) last_handle = encode_handle(zhdr, LAST); + z3fold_page_unlock(zhdr); } else { first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; + spin_unlock(&pool->lock); } - spin_unlock(&pool->lock); - /* Issue the eviction callback(s) */ if (middle_handle) { ret = pool->ops->evict(pool, middle_handle); @@ -562,36 +643,40 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) goto next; } next: - spin_lock(&pool->lock); - clear_bit(UNDER_RECLAIM, &page->private); - if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) || - (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 && - zhdr->middle_chunks == 0)) { - /* - * All buddies are now free, free the z3fold page and - * return success. 
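z3fold_free() above ends by dropping the per-page lock and doing the final put under pool->lock, and z3fold_reclaim_page() uses the same pattern: whichever path drops the last reference frees the page through release_z3fold_page() and the pool's page count is adjusted. A condensed view of that common tail:

    z3fold_page_unlock(zhdr);

    spin_lock(&pool->lock);
    /* Last reference gone: release_z3fold_page() unhooks and frees the page. */
    if (kref_put(&zhdr->refcount, release_z3fold_page))
        atomic64_dec(&pool->pages_nr);
    spin_unlock(&pool->lock);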
- */ - clear_bit(PAGE_HEADLESS, &page->private); - free_z3fold_page(zhdr); - pool->pages_nr--; - spin_unlock(&pool->lock); - return 0; - } else if (!test_bit(PAGE_HEADLESS, &page->private)) { - if (zhdr->first_chunks != 0 && - zhdr->last_chunks != 0 && - zhdr->middle_chunks != 0) { - /* Full, add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); + if (test_bit(PAGE_HEADLESS, &page->private)) { + if (ret == 0) { + free_z3fold_page(page); + return 0; } else { + spin_lock(&pool->lock); + } + } else { + z3fold_page_lock(zhdr); + if ((zhdr->first_chunks || zhdr->last_chunks || + zhdr->middle_chunks) && + !(zhdr->first_chunks && zhdr->last_chunks && + zhdr->middle_chunks)) { z3fold_compact_page(zhdr); /* add to unbuddied list */ + spin_lock(&pool->lock); freechunks = num_free_chunks(zhdr); list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_unlock(&pool->lock); + } + z3fold_page_unlock(zhdr); + spin_lock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page)) { + atomic64_dec(&pool->pages_nr); + return 0; } } - /* add to beginning of LRU */ + /* + * Add to the beginning of LRU. + * Pool lock has to be kept here to ensure the page has + * not already been released + */ list_add(&page->lru, &pool->lru); } spin_unlock(&pool->lock); @@ -615,7 +700,6 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) void *addr; enum buddy buddy; - spin_lock(&pool->lock); zhdr = handle_to_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); @@ -623,6 +707,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) if (test_bit(PAGE_HEADLESS, &page->private)) goto out; + z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: @@ -641,8 +726,9 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) addr = NULL; break; } + + z3fold_page_unlock(zhdr); out: - spin_unlock(&pool->lock); return addr; } @@ -657,31 +743,28 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) struct page *page; enum buddy buddy; - spin_lock(&pool->lock); zhdr = handle_to_z3fold_header(handle); page = virt_to_page(zhdr); - if (test_bit(PAGE_HEADLESS, &page->private)) { - spin_unlock(&pool->lock); + if (test_bit(PAGE_HEADLESS, &page->private)) return; - } + z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); - spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); } /** * z3fold_get_pool_size() - gets the z3fold pool size in pages * @pool: pool whose size is being queried * - * Returns: size in pages of the given pool. The pool lock need not be - * taken to access pages_nr. + * Returns: size in pages of the given pool. 
*/ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) { - return pool->pages_nr; + return atomic64_read(&pool->pages_nr); } /***************** @@ -780,8 +863,8 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { - /* Make sure the z3fold header will fit in one chunk */ - BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED); + /* Make sure the z3fold header is not larger than the page size */ + BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); zpool_register_driver(&z3fold_zpool_driver); return 0; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a1f24989ac23..b7b1fb6c8c21 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -24,7 +24,6 @@ * * Usage of struct page flags: * PG_private: identifies the first component page - * PG_private2: identifies the last component page * PG_owner_priv_1: identifies the huge component page * */ @@ -268,10 +267,6 @@ struct zs_pool { #endif }; -/* - * A zspage's class index and fullness group - * are encoded in its (first)page->mapping - */ #define FULLNESS_BITS 2 #define CLASS_BITS 8 #define ISOLATED_BITS 3 @@ -938,7 +933,6 @@ static void reset_page(struct page *page) { __ClearPageMovable(page); ClearPagePrivate(page); - ClearPagePrivate2(page); set_page_private(page, 0); page_mapcount_reset(page); ClearPageHugeObject(page); @@ -1085,7 +1079,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * 2. each sub-page point to zspage using page->private * * we set PG_private to identify the first page (i.e. no other sub-page - * has this flag set) and PG_private_2 to identify the last page. + * has this flag set). */ for (i = 0; i < nr_pages; i++) { page = pages[i]; @@ -1100,8 +1094,6 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, } else { prev_page->freelist = page; } - if (i == nr_pages - 1) - SetPagePrivate2(page); prev_page = page; } } |
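The relaxed BUILD_BUG_ON above reflects that the z3fold header may now legitimately span several chunks; the only hard requirement left is that the rounded-up header still fits in a page. A user-space analogue of that compile-time check (the struct below is a stand-in, not the real kernel layout):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define CHUNK_SIZE     64UL
#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

/* Stand-in for struct z3fold_header; real field sizes vary by config. */
struct example_header {
    unsigned long list[2];
    unsigned long lock;
    unsigned long refcount;
    unsigned short chunks[4];
    unsigned short first_num;
};

#define ZHDR_SIZE_ALIGNED ROUND_UP(sizeof(struct example_header), CHUNK_SIZE)

/* Same spirit as the kernel's BUILD_BUG_ON: fail the build, not the run. */
static_assert(ZHDR_SIZE_ALIGNED <= PAGE_SIZE, "header must fit in one page");

int main(void)
{
    printf("aligned header size: %lu bytes\n", (unsigned long)ZHDR_SIZE_ALIGNED);
    return 0;
}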