diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 1 | ||||
-rw-r--r-- | mm/debug.c | 4 | ||||
-rw-r--r-- | mm/gup.c | 3 | ||||
-rw-r--r-- | mm/hugetlb.c | 84 | ||||
-rw-r--r-- | mm/kasan/Makefile | 3 | ||||
-rw-r--r-- | mm/kasan/common.c | 82 | ||||
-rw-r--r-- | mm/kasan/tags.c | 2 | ||||
-rw-r--r-- | mm/kmemleak.c | 10 | ||||
-rw-r--r-- | mm/memblock.c | 11 | ||||
-rw-r--r-- | mm/memory-failure.c | 19 | ||||
-rw-r--r-- | mm/memory.c | 26 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 85 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/migrate.c | 25 | ||||
-rw-r--r-- | mm/mincore.c | 94 | ||||
-rw-r--r-- | mm/oom_kill.c | 12 | ||||
-rw-r--r-- | mm/page_alloc.c | 40 | ||||
-rw-r--r-- | mm/page_ext.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 8 | ||||
-rw-r--r-- | mm/shmem.c | 10 | ||||
-rw-r--r-- | mm/slab.c | 21 | ||||
-rw-r--r-- | mm/slab.h | 7 | ||||
-rw-r--r-- | mm/slab_common.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 61 | ||||
-rw-r--r-- | mm/swap.c | 17 | ||||
-rw-r--r-- | mm/usercopy.c | 9 | ||||
-rw-r--r-- | mm/userfaultfd.c | 11 | ||||
-rw-r--r-- | mm/util.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 10 |
29 files changed, 387 insertions, 285 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8a8bb8796c6c..72e6d0c55cfa 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { diff --git a/mm/debug.c b/mm/debug.c index 0abb987dad9b..1611cf00a137 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping; bool page_poisoned = PagePoisoned(page); int mapcount; @@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason) goto hex_only; } + mapping = page_mapping(page); + /* * Avoid VM_BUG_ON() in page_mapcount(). * page->_mapcount space in struct page is used by sl[aou]b pages to @@ -1674,7 +1674,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (!pmd_present(pmd)) return 0; - if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { + if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || + pmd_devmap(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 745088810965..afef61656c1e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; - struct address_space *mapping = vma->vm_file->f_mapping; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; @@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_range_init(&range, src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); - } else { - /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. - */ - i_mmap_lock_read(mapping); } for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) { ret = -ENOMEM; @@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (cow) mmu_notifier_invalidate_range_end(&range); - else - i_mmap_unlock_read(mapping); return ret; } @@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * Check once here for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - retry: page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + /* * Check for page in userfault range */ @@ -3784,18 +3771,14 @@ retry: }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } @@ -3854,6 +3837,9 @@ retry: } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) @@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, mm, ptep); @@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with file truncation. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something changed. - */ mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); - if (!ptep) { - i_mmap_unlock_read(mapping); - return VM_FAULT_OOM; - } + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4066,7 +4034,6 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -4301,7 +4268,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } if (ret & VM_FAULT_RETRY) { - if (nonblocking) + if (nonblocking && + !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; *nr_pages = 0; /* @@ -4671,12 +4639,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode. - * For hugetlbfs, this prevents removal of any page table entries associated - * with the address space. This is important as we are setting up sharing - * based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -4693,6 +4659,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4722,6 +4689,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_write(mapping); return pte; } @@ -4732,7 +4700,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 0a14fcff70ed..5d1065efbd47 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -5,7 +5,10 @@ UBSAN_SANITIZE_generic.o := n UBSAN_SANITIZE_tags.o := n KCOV_INSTRUMENT := n +CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_generic.o = -pg +CFLAGS_REMOVE_tags.o = -pg + # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 03d5d1374ca7..09b534fbba17 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, return; } - cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE); - *flags |= SLAB_KASAN; } @@ -349,28 +347,48 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) } /* - * Since it's desirable to only call object contructors once during slab - * allocation, we preassign tags to all such objects. Also preassign tags for - * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports. - * For SLAB allocator we can't preassign tags randomly since the freelist is - * stored as an array of indexes instead of a linked list. Assign tags based - * on objects indexes, so that objects that are next to each other get - * different tags. - * After a tag is assigned, the object always gets allocated with the same tag. - * The reason is that we can't change tags for objects with constructors on - * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor - * code can save the pointer to the object somewhere (e.g. in the object - * itself). Then if we retag it, the old saved pointer will become invalid. + * This function assigns a tag to an object considering the following: + * 1. A cache might have a constructor, which might save a pointer to a slab + * object somewhere (e.g. in the object itself). We preassign a tag for + * each object in caches with constructors during slab creation and reuse + * the same tag each time a particular object is allocated. + * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be + * accessed after being freed. We preassign tags for objects in these + * caches as well. + * 3. For SLAB allocator we can't preassign tags randomly since the freelist + * is stored as an array of indexes instead of a linked list. Assign tags + * based on objects indexes, so that objects that are next to each other + * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new) +static u8 assign_tag(struct kmem_cache *cache, const void *object, + bool init, bool keep_tag) { + /* + * 1. When an object is kmalloc()'ed, two hooks are called: + * kasan_slab_alloc() and kasan_kmalloc(). We assign the + * tag only in the first one. + * 2. We reuse the same tag for krealloc'ed objects. + */ + if (keep_tag) + return get_tag(object); + + /* + * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU + * set, assign a tag when the object is being allocated (init == false). + */ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return new ? KASAN_TAG_KERNEL : random_tag(); + return init ? KASAN_TAG_KERNEL : random_tag(); + /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB + /* For SLAB assign tags based on the object index in the freelist. */ return (u8)obj_to_index(cache, virt_to_page(object), (void *)object); #else - return new ? random_tag() : get_tag(object); + /* + * For SLUB assign a random tag during slab creation, otherwise reuse + * the already assigned tag. + */ + return init ? random_tag() : get_tag(object); #endif } @@ -386,17 +404,12 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, __memset(alloc_info, 0, sizeof(*alloc_info)); if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - object = set_tag(object, assign_tag(cache, object, true)); + object = set_tag(object, + assign_tag(cache, object, true, false)); return (void *)object; } -void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, - gfp_t flags) -{ - return kasan_kmalloc(cache, object, cache->object_size, flags); -} - static inline bool shadow_invalid(u8 tag, s8 shadow_byte) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) @@ -452,8 +465,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return __kasan_slab_free(cache, object, ip, true); } -void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; unsigned long redzone_end; @@ -471,7 +484,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_SHADOW_SCALE_SIZE); if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - tag = assign_tag(cache, object, false); + tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ kasan_unpoison_shadow(set_tag(object, tag), size); @@ -483,6 +496,18 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, return set_tag(object, tag); } + +void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, + gfp_t flags) +{ + return __kasan_kmalloc(cache, object, cache->object_size, flags, false); +} + +void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) +{ + return __kasan_kmalloc(cache, object, size, flags, true); +} EXPORT_SYMBOL(kasan_kmalloc); void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, @@ -522,7 +547,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) if (unlikely(!PageSlab(page))) return kasan_kmalloc_large(object, size, flags); else - return kasan_kmalloc(page->slab_cache, object, size, flags); + return __kasan_kmalloc(page->slab_cache, object, size, + flags, true); } void kasan_poison_kfree(void *ptr, unsigned long ip) diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 0777649e07c4..63fca3172659 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -46,7 +46,7 @@ void kasan_init_tags(void) int cpu; for_each_possible_cpu(cpu) - per_cpu(prng_state, cpu) = get_random_u32(); + per_cpu(prng_state, cpu) = (u32)get_cycles(); } /* diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f9d9dc250428..707fa5579f66 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, unsigned long flags; struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, write_lock_irqsave(&kmemleak_lock, flags); - min_addr = min(min_addr, ptr); - max_addr = max(max_addr, ptr + size); + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); + min_addr = min(min_addr, untagged_ptr); + max_addr = max(max_addr, untagged_ptr + size); link = &object_tree_root.rb_node; rb_parent = NULL; while (*link) { @@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end, unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); unsigned long flags; + unsigned long untagged_ptr; read_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { @@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end, pointer = *ptr; kasan_enable_current(); - if (pointer < min_addr || pointer >= max_addr) + untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); + if (untagged_ptr < min_addr || untagged_ptr >= max_addr) continue; /* diff --git a/mm/memblock.c b/mm/memblock.c index 022d4cbb3618..ea31045ba704 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -26,6 +26,13 @@ #include "internal.h" +#define INIT_MEMBLOCK_REGIONS 128 +#define INIT_PHYSMEM_REGIONS 4 + +#ifndef INIT_MEMBLOCK_RESERVED_REGIONS +# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS +#endif + /** * DOC: memblock overview * @@ -92,7 +99,7 @@ unsigned long max_pfn; unsigned long long max_possible_pfn; static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; -static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; #endif @@ -105,7 +112,7 @@ struct memblock memblock __initdata_memblock = { .reserved.regions = memblock_reserved_init_regions, .reserved.cnt = 1, /* empty dummy entry */ - .reserved.max = INIT_MEMBLOCK_REGIONS, + .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6379fff1a5ff..831be5ff5f4d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, if (fail || tk->addr_valid == 0) { pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); - force_sig(SIGKILL, tk->tsk); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, + tk->tsk, PIDTYPE_PID); } /* @@ -966,7 +967,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - bool unmap_success = true; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); @@ -1028,19 +1029,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - if (!PageHuge(hpage)) { - unmap_success = try_to_unmap(hpage, ttu); - } else if (mapping) { - /* - * For hugetlb pages, try_to_unmap could potentially call - * huge_pmd_unshare. Because of this, take semaphore in - * write mode here and set TTU_RMAP_LOCKED to indicate we - * have taken the lock at this higer level. - */ - i_mmap_lock_write(mapping); - unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } + unmap_success = try_to_unmap(hpage, ttu); if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); diff --git a/mm/memory.c b/mm/memory.c index a52663c0612d..e11ca9dd823f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; + /* + * Preallocate pte before we take page_lock because this might lead to + * deadlocks for memcg reclaim which waits for pages under writeback: + * lock_page(A) + * SetPageWriteback(A) + * unlock_page(A) + * lock_page(B) + * lock_page(B) + * pte_alloc_pne + * shrink_page_list + * wait_on_page_writeback(A) + * SetPageWriteback(B) + * unlock_page(B) + * # flush A, B to clear the writeback + */ + if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) @@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - range->start = address & PAGE_MASK; - range->end = range->start + PAGE_SIZE; + mmu_notifier_range_init(range, mm, address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b9a667d36c55..1ad28323fb9f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1188,11 +1188,13 @@ static inline int pageblock_free(struct page *page) return PageBuddy(page) && page_order(page) >= pageblock_order; } -/* Return the start of the next active pageblock after a given page */ -static struct page *next_active_pageblock(struct page *page) +/* Return the pfn of the start of the next active pageblock after a given pfn */ +static unsigned long next_active_pageblock(unsigned long pfn) { + struct page *page = pfn_to_page(pfn); + /* Ensure the starting page is pageblock-aligned */ - BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + BUG_ON(pfn & (pageblock_nr_pages - 1)); /* If the entire pageblock is free, move to the end of free page */ if (pageblock_free(page)) { @@ -1200,16 +1202,16 @@ static struct page *next_active_pageblock(struct page *page) /* be careful. we don't have locks, page_order can be changed.*/ order = page_order(page); if ((order < MAX_ORDER) && (order >= pageblock_order)) - return page + (1 << order); + return pfn + (1 << order); } - return page + pageblock_nr_pages; + return pfn + pageblock_nr_pages; } -static bool is_pageblock_removable_nolock(struct page *page) +static bool is_pageblock_removable_nolock(unsigned long pfn) { + struct page *page = pfn_to_page(pfn); struct zone *zone; - unsigned long pfn; /* * We have to be careful here because we are iterating over memory @@ -1232,12 +1234,14 @@ static bool is_pageblock_removable_nolock(struct page *page) /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - struct page *page = pfn_to_page(start_pfn); - struct page *end_page = page + nr_pages; + unsigned long end_pfn, pfn; + + end_pfn = min(start_pfn + nr_pages, + zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); /* Check the starting page of each pageblock within the range */ - for (; page < end_page; page = next_active_pageblock(page)) { - if (!is_pageblock_removable_nolock(page)) + for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { + if (!is_pageblock_removable_nolock(pfn)) return false; cond_resched(); } @@ -1273,6 +1277,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; + /* Check if we got outside of the zone */ + if (zone && !zone_spans_pfn(zone, pfn + i)) + return 0; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; @@ -1301,23 +1308,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; - struct page *page; + for (pfn = start; pfn < end; pfn++) { - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageLRU(page)) - return pfn; - if (__PageMovable(page)) - return pfn; - if (PageHuge(page)) { - if (hugepage_migration_supported(page_hstate(page)) && - page_huge_active(page)) - return pfn; - else - pfn = round_up(pfn + 1, - 1 << compound_order(page)) - 1; - } - } + struct page *page, *head; + unsigned long skip; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageLRU(page)) + return pfn; + if (__PageMovable(page)) + return pfn; + + if (!PageHuge(page)) + continue; + head = compound_head(page); + if (hugepage_migration_supported(page_hstate(head)) && + page_huge_active(head)) + return pfn; + skip = (1 << compound_order(head)) - (page - head); + pfn += skip - 1; } return 0; } @@ -1344,7 +1355,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; - int not_managed = 0; int ret = 0; LIST_HEAD(source); @@ -1392,7 +1402,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ - put_page(page); list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + @@ -1401,22 +1410,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } else { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); - put_page(page); - /* Because we don't have big zone->lock. we should - check this again here. */ - if (page_count(page)) { - not_managed++; - ret = -EBUSY; - break; - } } + put_page(page); } if (!list_empty(&source)) { - if (not_managed) { - putback_movable_pages(&source); - goto out; - } - /* Allocate a new page from the nearest neighbor node */ ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); @@ -1429,7 +1426,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) putback_movable_pages(&source); } } -out: + return ret; } @@ -1576,7 +1573,6 @@ static int __ref __offline_pages(unsigned long start_pfn, we assume this for now. .*/ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) { - mem_hotplug_done(); ret = -EINVAL; reason = "multizone range"; goto failed_removal; @@ -1591,7 +1587,6 @@ static int __ref __offline_pages(unsigned long start_pfn, MIGRATE_MOVABLE, SKIP_HWPOISON | REPORT_FAILURE); if (ret) { - mem_hotplug_done(); reason = "failure to isolate range"; goto failed_removal; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d4496d9d34f5..ee2bce59d2bf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy, int uninitialized_var(pval); nodemask_t nodes; - if (nmask != NULL && maxnode < MAX_NUMNODES) + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; err = do_get_mempolicy(&pval, &nodes, addr, flags); @@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) diff --git a/mm/migrate.c b/mm/migrate.c index ccf8966caf6f..d4fd680be3b0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -709,7 +709,6 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, /* Simple case, sync compaction */ if (mode != MIGRATE_ASYNC) { do { - get_bh(bh); lock_buffer(bh); bh = bh->b_this_page; @@ -720,18 +719,15 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, /* async case, we cannot block on lock_buffer so use trylock_buffer */ do { - get_bh(bh); if (!trylock_buffer(bh)) { /* * We failed to lock the buffer and cannot stall in * async migration. Release the taken locks */ struct buffer_head *failed_bh = bh; - put_bh(failed_bh); bh = head; while (bh != failed_bh) { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } return false; @@ -818,7 +814,6 @@ unlock_buffers: bh = head; do { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -1135,10 +1130,13 @@ out: * If migration is successful, decrease refcount of the newpage * which will not free the page because new page owner increased * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. + * list in here. Use the old state of the isolated source page to + * determine if we migrated a LRU page. newpage was already unlocked + * and possibly modified by its owner - don't rely on the page + * state. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(__PageMovable(newpage))) + if (unlikely(!is_lru)) put_page(newpage); else putback_lru_page(newpage); @@ -1324,19 +1322,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, goto put_anon; if (page_mapped(hpage)) { - struct address_space *mapping = page_mapping(hpage); - - /* - * try_to_unmap could potentially call huge_pmd_unshare. - * Because of this, take semaphore in write mode here and - * set TTU_RMAP_LOCKED to let lower levels know we have - * taken the lock. - */ - i_mmap_lock_write(mapping); try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| - TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } diff --git a/mm/mincore.c b/mm/mincore.c index f0f91461a9f4..218099b5ed31 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -42,14 +42,72 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, return 0; } -static int mincore_unmapped_range(unsigned long addr, unsigned long end, - struct mm_walk *walk) +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. + */ +static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) +{ + unsigned char present = 0; + struct page *page; + + /* + * When tmpfs swaps out a page from a file, any process mapping that + * file will not get a swp_entry_t in its pte, but rather it is like + * any other file mapping (ie. marked !present and faulted in with + * tmpfs's .fault). So swapped out tmpfs mappings are tested here. + */ +#ifdef CONFIG_SWAP + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (xa_is_value(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); +#endif + if (page) { + present = PageUptodate(page); + put_page(page); + } + + return present; +} + +static int __mincore_unmapped_range(unsigned long addr, unsigned long end, + struct vm_area_struct *vma, unsigned char *vec) { - unsigned char *vec = walk->private; unsigned long nr = (end - addr) >> PAGE_SHIFT; + int i; - memset(vec, 0, nr); - walk->private += nr; + if (vma->vm_file) { + pgoff_t pgoff; + + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; + } + return nr; +} + +static int mincore_unmapped_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + walk->private += __mincore_unmapped_range(addr, end, + walk->vma, walk->private); return 0; } @@ -69,9 +127,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, goto out; } - /* We'll consider a THP page under construction to be there */ if (pmd_trans_unstable(pmd)) { - memset(vec, 1, nr); + __mincore_unmapped_range(addr, end, vma, vec); goto out; } @@ -80,17 +137,28 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t pte = *ptep; if (pte_none(pte)) - *vec = 0; + __mincore_unmapped_range(addr, addr + PAGE_SIZE, + vma, vec); else if (pte_present(pte)) *vec = 1; else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); - /* - * migration or hwpoison entries are always - * uptodate - */ - *vec = !!non_swap_entry(entry); + if (non_swap_entry(entry)) { + /* + * migration or hwpoison entries are always + * uptodate + */ + *vec = 1; + } else { +#ifdef CONFIG_SWAP + *vec = mincore_page(swap_address_space(entry), + swp_offset(entry)); +#else + WARN_ON(1); + *vec = 1; +#endif + } } vec++; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f0e8cd9edb1a..26ea8636758f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -647,8 +647,8 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); @@ -975,6 +975,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * still freeing memory. */ read_lock(&tasklist_lock); + + /* + * The task 'p' might have already exited before reaching here. The + * put_task_struct() will free task_struct 'p' while the loop still try + * to access the field of 'p', so, get an extra reference. + */ + get_task_struct(p); for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -994,6 +1001,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } } } + put_task_struct(p); read_unlock(&tasklist_lock); /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cde5dac6229a..0b9f577b1a2a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2170,6 +2170,18 @@ static inline void boost_watermark(struct zone *zone) max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. + */ + if (!max_boost) + return; + max_boost = max(pageblock_nr_pages, max_boost); zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, @@ -2214,7 +2226,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, */ boost_watermark(zone); if (alloc_flags & ALLOC_KSWAPD) - wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); /* We are not allowed to try stealing from the whole block */ if (!whole_block) @@ -3102,6 +3114,12 @@ struct page *rmqueue(struct zone *preferred_zone, local_irq_restore(flags); out: + /* Separate test+clear to avoid unnecessary atomics */ + if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + } + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; @@ -4669,11 +4687,11 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - page_ref_add(page, size - 1); + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); - nc->pagecnt_bias = size; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; nc->offset = size; } @@ -4689,10 +4707,10 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - set_page_count(page, size); + set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; } @@ -5695,18 +5713,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, cond_resched(); } } -#ifdef CONFIG_SPARSEMEM - /* - * If the zone does not span the rest of the section then - * we should at least initialize those pages. Otherwise we - * could blow up on a poisoned page in some paths which depend - * on full sections being initialized (e.g. memory hotplug). - */ - while (end_pfn % PAGES_PER_SECTION) { - __init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid); - end_pfn++; - } -#endif } #ifdef CONFIG_ZONE_DEVICE diff --git a/mm/page_ext.c b/mm/page_ext.c index ae44f7adbe07..8c78b8d45117 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -398,10 +398,8 @@ void __init page_ext_init(void) * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... - * - * Take into account DEFERRED_STRUCT_PAGE_INIT. */ - if (early_pfn_to_nid(pfn) != nid) + if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; diff --git a/mm/rmap.c b/mm/rmap.c index 21a26cf51114..0454ecc29537 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -25,7 +25,6 @@ * page->flags PG_locked (lock_page) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone_lru_lock (in mark_page_accessed, isolate_lru_page) @@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start, - min(vma->vm_end, vma->vm_start + + mmu_notifier_range_init(&range, vma->vm_mm, address, + min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted * accordingly. - * - * If called for a huge page, caller must hold i_mmap_rwsem - * in write mode as it is possible to call huge_pmd_unshare. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); diff --git a/mm/shmem.c b/mm/shmem.c index 6ece1e2fe76e..0905215fb016 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2854,10 +2854,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. */ - ret = shmem_reserve_inode(inode->i_sb); - if (ret) - goto out; + if (inode->i_nlink) { + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; + } dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); diff --git a/mm/slab.c b/mm/slab.c index 73fe23e649c9..91c1863df93d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, struct alien_cache *alc = NULL; alc = kmalloc_node(memsize, gfp, node); - init_arraycache(&alc->ac, entries, batch); - spin_lock_init(&alc->lock); + if (alc) { + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + } return alc; } @@ -2357,7 +2359,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); - page->s_mem = kasan_reset_tag(addr) + colour_off; + page->s_mem = addr + colour_off; page->active = 0; if (OBJFREELIST_SLAB(cachep)) @@ -2366,6 +2368,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); + freelist = kasan_reset_tag(freelist); if (!freelist) return NULL; } else { @@ -2679,6 +2682,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, offset *= cachep->colour_off; + /* + * Call kasan_poison_slab() before calling alloc_slabmgmt(), so + * page_address() in the latter returns a non-tagged pointer, + * as it should be for slab pages. + */ + kasan_poison_slab(page); + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, page_node); @@ -2687,7 +2697,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); - kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -3538,7 +3547,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); - ret = kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3628,7 +3636,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - ret = kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -4406,6 +4413,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, unsigned int objnr; unsigned long offset; + ptr = kasan_reset_tag(ptr); + /* Find and validate object. */ cachep = page->slab_cache; objnr = obj_to_index(cachep, page, (void *)ptr); diff --git a/mm/slab.h b/mm/slab.h index 4190c24ef0e9..384105318779 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -437,11 +437,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemleak_alloc_recursive(object, s->object_size, 1, + p[i] = kasan_slab_alloc(s, p[i], flags); + /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); - p[i] = kasan_slab_alloc(s, object, flags); } if (memcg_kmem_enabled()) diff --git a/mm/slab_common.c b/mm/slab_common.c index 81732d05e74a..f9d89c1b5977 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1228,8 +1228,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) flags |= __GFP_COMP; page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; - kmemleak_alloc(ret, size, 1, flags); ret = kasan_kmalloc_large(ret, size, flags); + /* As ret might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ret, size, 1, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); diff --git a/mm/slub.c b/mm/slub.c index 36c0befeebd8..dc777761b6b7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, unsigned long ptr_addr) { #ifdef CONFIG_SLAB_FREELIST_HARDENED - return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); + /* + * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. + * Normally, this doesn't cause any issues, as both set_freepointer() + * and get_freepointer() are called with a pointer with the same tag. + * However, there are some issues with CONFIG_SLUB_DEBUG code. For + * example, when __free_slub() iterates over objects in a cache, it + * passes untagged pointers to check_object(). check_object() in turns + * calls get_freepointer() with an untagged pointer, which causes the + * freepointer to be restored incorrectly. + */ + return (void *)((unsigned long)ptr ^ s->random ^ + (unsigned long)kasan_reset_tag((void *)ptr_addr)); #else return ptr; #endif @@ -303,15 +314,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p < (__addr) + (__objects) * (__s)->size; \ __p += (__s)->size) -#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = fixup_red_left(__s, __addr), __idx = 1; \ - __idx <= __objects; \ - __p += (__s)->size, __idx++) - /* Determine object index from a given position */ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) { - return (p - addr) / s->size; + return (kasan_reset_tag(p) - addr) / s->size; } static inline unsigned int order_objects(unsigned int order, unsigned int size) @@ -507,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); + object = kasan_reset_tag(object); object = restore_red_left(s, object); if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { @@ -1075,6 +1082,16 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } +static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +{ + if (!(s->flags & SLAB_POISON)) + return; + + metadata_access_enable(); + memset(addr, POISON_INUSE, PAGE_SIZE << order); + metadata_access_disable(); +} + static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) @@ -1330,6 +1347,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} +static inline void setup_page_debug(struct kmem_cache *s, + void *addr, int order) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1374,8 +1393,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, */ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); - return kasan_kmalloc_large(ptr, size, flags); + return ptr; } static __always_inline void kfree_hook(void *x) @@ -1641,27 +1662,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - start = page_address(page); + kasan_poison_slab(page); - if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << order); + start = page_address(page); - kasan_poison_slab(page); + setup_page_debug(s, start, order); shuffle = shuffle_freelist(s, page); if (!shuffle) { - for_each_object_idx(p, idx, s, start, page->objects) { - if (likely(idx < page->objects)) { - next = p + s->size; - next = setup_object(s, page, next); - set_freepointer(s, p, next); - } else - set_freepointer(s, p, NULL); - } start = fixup_red_left(s, start); start = setup_object(s, page, start); page->freelist = start; + for (idx = 0, p = start; idx < page->objects - 1; idx++) { + next = p + s->size; + next = setup_object(s, page, next); + set_freepointer(s, p, next); + p = next; + } + set_freepointer(s, p, NULL); } page->inuse = page->objects; @@ -3846,6 +3865,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, unsigned int offset; size_t object_size; + ptr = kasan_reset_tag(ptr); + /* Find object and usable object size. */ s = page->slab_cache; diff --git a/mm/swap.c b/mm/swap.c index 4929bc1be60e..4d7d37eb3c40 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -320,11 +320,6 @@ static inline void activate_page_drain(int cpu) { } -static bool need_activate_page_drain(int cpu) -{ - return false; -} - void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -653,13 +648,15 @@ void lru_add_drain(void) put_cpu(); } +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); } -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); - /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is @@ -702,6 +699,12 @@ void lru_add_drain_all(void) mutex_unlock(&lock); } +#else +void lru_add_drain_all(void) +{ + lru_add_drain(); +} +#endif /** * release_pages - batched put_page() diff --git a/mm/usercopy.c b/mm/usercopy.c index 852eb4e53f06..14faadcedd06 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address - * - known-safe heap or stack object + * - fully contained by stack (or stack frame, when available) + * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) @@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); - /* Check for bad heap object. */ - check_heap_object(ptr, n, to_user); - /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: @@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) usercopy_abort("process stack", NULL, to_user, 0, n); } + /* Check for bad heap object. */ + check_heap_object(ptr, n, to_user); + /* Check for object in kernel to avoid text exposure. */ check_kernel_text_object((const unsigned long)ptr, n, to_user); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 065c1ce191c4..d59b5a73dfb3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -267,14 +267,10 @@ retry: VM_BUG_ON(dst_addr & ~huge_page_mask(h)); /* - * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even - * in the case of shared pmds. fault mutex prevents - * races with other faulting threads. + * Serialize via hugetlb_fault_mutex */ - mapping = dst_vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -283,7 +279,6 @@ retry: dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -291,7 +286,6 @@ retry: dst_pteval = huge_ptep_get(dst_pte); if (!huge_pte_none(dst_pteval)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -299,7 +293,6 @@ retry: dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); vm_alloc_shared = vm_shared; cond_resched(); diff --git a/mm/util.c b/mm/util.c index 4df23d64aac7..379319b1bcfd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -150,7 +150,7 @@ void *memdup_user(const void __user *src, size_t len) { void *p; - p = kmalloc_track_caller(len, GFP_USER); + p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < hpage_nr_pages(page); i++) { + for (i = 0; i < (1 << compound_order(page)); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a714c4f800e9..e979705bbf32 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; } - /* - * Make sure we apply some minimal pressure on default priority - * even on small cgroups. Stale objects are not only consuming memory - * by themselves, but can also hold a reference to a dying cgroup, - * preventing it from being reclaimed. A dying cgroup with all - * corresponding structures like per-cpu stats and kmem caches - * can be really big, so it may lead to a significant waste of memory. - */ - delta = max_t(unsigned long long, delta, min(freeable, batch_size)); - total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", |