diff options
Diffstat (limited to 'mm')
37 files changed, 728 insertions, 257 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 43700480d897..151090fdcf29 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1433,6 +1433,67 @@ void unlock_page(struct page *page) EXPORT_SYMBOL(unlock_page); /** + * end_page_private_2 - Clear PG_private_2 and release any waiters + * @page: The page + * + * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for + * this. The page ref held for PG_private_2 being set is released. + * + * This is, for example, used when a netfs page is being written to a local + * disk cache, thereby allowing writes to the cache for the same page to be + * serialised. + */ +void end_page_private_2(struct page *page) +{ + page = compound_head(page); + VM_BUG_ON_PAGE(!PagePrivate2(page), page); + clear_bit_unlock(PG_private_2, &page->flags); + wake_up_page_bit(page, PG_private_2); + put_page(page); +} +EXPORT_SYMBOL(end_page_private_2); + +/** + * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page + * @page: The page to wait on + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page. + */ +void wait_on_page_private_2(struct page *page) +{ + page = compound_head(page); + while (PagePrivate2(page)) + wait_on_page_bit(page, PG_private_2); +} +EXPORT_SYMBOL(wait_on_page_private_2); + +/** + * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page + * @page: The page to wait on + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a + * fatal signal is received by the calling task. + * + * Return: + * - 0 if successful. + * - -EINTR if a fatal signal was encountered. + */ +int wait_on_page_private_2_killable(struct page *page) +{ + int ret = 0; + + page = compound_head(page); + while (PagePrivate2(page)) { + ret = wait_on_page_bit_killable(page, PG_private_2); + if (ret < 0) + break; + } + + return ret; +} +EXPORT_SYMBOL(wait_on_page_private_2_killable); + +/** * end_page_writeback - end writeback against a page * @page: the page */ @@ -1969,8 +2030,14 @@ unlock: put: put_page(page); next: - if (!xa_is_value(page) && PageTransHuge(page)) - xas_set(&xas, page->index + thp_nr_pages(page)); + if (!xa_is_value(page) && PageTransHuge(page)) { + unsigned int nr_pages = thp_nr_pages(page); + + /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ + xas_set(&xas, page->index + nr_pages); + if (xas.xa_index < nr_pages) + break; + } } rcu_read_unlock(); @@ -2672,7 +2739,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); - pgoff_t max = (end - 1) / PAGE_SIZE; + pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct page *page; @@ -2681,7 +2748,8 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, rcu_read_lock(); while ((page = find_get_entry(&xas, max, XA_PRESENT))) { - loff_t pos = xas.xa_index * PAGE_SIZE; + loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; + unsigned int seek_size; if (start < pos) { if (!seek_data) @@ -2689,25 +2757,25 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - pos += seek_page_size(&xas, page); + seek_size = seek_page_size(&xas, page); + pos = round_up(pos + 1, seek_size); start = page_seek_hole_data(&xas, mapping, page, start, pos, seek_data); if (start < pos) goto unlock; + if (start >= end) + break; + if (seek_size > PAGE_SIZE) + xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(page)) put_page(page); } - rcu_read_unlock(); - if (seek_data) - return -ENXIO; - goto out; - + start = -ENXIO; unlock: rcu_read_unlock(); - if (!xa_is_value(page)) + if (page && !xa_is_value(page)) put_page(page); -out: if (start > end) return end; return start; @@ -2771,7 +2839,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; - DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); + DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; @@ -2783,7 +2851,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (vmf->vma->vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_ra(&ractl, ra, ra->ra_pages); + page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } @@ -1535,6 +1535,10 @@ struct page *get_dump_page(unsigned long addr) FOLL_FORCE | FOLL_DUMP | FOLL_GET); if (locked) mmap_read_unlock(mm); + + if (ret == 1 && is_page_poisoned(page)) + return NULL; + return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ diff --git a/mm/highmem.c b/mm/highmem.c index 874b732b120c..6ef8f5e05e7e 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + if (start1 >= end1) + start1 = end1 = 0; + if (start2 >= end2) + start2 = end2 = 0; + for (i = 0; i < compound_nr(page); i++) { void *kaddr = NULL; - if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) - kaddr = kmap_atomic(page + i); - if (start1 >= PAGE_SIZE) { start1 -= PAGE_SIZE; end1 -= PAGE_SIZE; } else { unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); - if (end1 > start1) + if (end1 > start1) { + kaddr = kmap_atomic(page + i); memset(kaddr + start1, 0, this_end - start1); + } end1 -= this_end; start1 = 0; } @@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, } else { unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); - if (end2 > start2) + if (end2 > start2) { + if (!kaddr) + kaddr = kmap_atomic(page + i); memset(kaddr + start2, 0, this_end - start2); + } end2 -= this_end; start2 = 0; } @@ -611,7 +618,7 @@ void __kmap_local_sched_out(void) int idx; /* With debug all even slots are unmapped and act as guard */ - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(!pte_none(pteval)); continue; } @@ -647,7 +654,7 @@ void __kmap_local_sched_in(void) int idx; /* With debug all even slots are unmapped and act as guard */ - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(!pte_none(pteval)); continue; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 395c75111d33..ae907a9c2050 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * best effort that the pinned pages won't be replaced by another * random page during the coming copy-on-write. */ - if (unlikely(is_cow_mapping(vma->vm_flags) && - atomic_read(&src_mm->has_pinned) && - page_maybe_dma_pinned(src_page))) { + if (unlikely(page_needs_cow_for_dma(vma, src_page))) { pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, } /* Please refer to comments in copy_huge_pmd() */ - if (unlikely(is_cow_mapping(vma->vm_flags) && - atomic_read(&src_mm->has_pinned) && - page_maybe_dma_pinned(pud_page(pud)))) { + if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) { spin_unlock(src_ptl); spin_unlock(dst_ptl); __split_huge_pud(vma, src_pud, addr); @@ -2471,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, int i; /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(head); + split_page_memcg(head, nr); if (PageAnon(head) && PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8fb42c6dd74b..a86a58ef132d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -280,6 +280,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; + /* + * The caller will hold exactly one h_cg->css reference for the + * whole contiguous reservation region. But this area might be + * scattered when there are already some file_regions reside in + * it. As a result, many file_regions may share only one css + * reference. In order to ensure that one file_region must hold + * exactly one h_cg->css reference, we should do css_get for + * each file_region and leave the reference held by caller + * untouched. + */ + css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in @@ -293,6 +304,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, #endif } +static void put_uncharge_info(struct file_region *rg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (rg->css) + css_put(rg->css); +#endif +} + static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { @@ -316,6 +335,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) prg->to = rg->to; list_del(&rg->link); + put_uncharge_info(rg); kfree(rg); rg = prg; @@ -327,10 +347,29 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) nrg->from = rg->from; list_del(&rg->link); + put_uncharge_info(rg); kfree(rg); } } +static inline long +hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, + long to, struct hstate *h, struct hugetlb_cgroup *cg, + long *regions_needed) +{ + struct file_region *nrg; + + if (!regions_needed) { + nrg = get_file_region_entry_from_cache(map, from, to); + record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); + list_add(&nrg->link, rg->link.prev); + coalesce_file_region(map, nrg); + } else + *regions_needed += 1; + + return to - from; +} + /* * Must be called with resv->lock held. * @@ -346,7 +385,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; - struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + struct file_region *rg = NULL, *trg = NULL; if (regions_needed) *regions_needed = 0; @@ -369,24 +408,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, /* When we find a region that starts beyond our range, we've * finished. */ - if (rg->from > t) + if (rg->from >= t) break; /* Add an entry for last_accounted_offset -> rg->from, and * update last_accounted_offset. */ - if (rg->from > last_accounted_offset) { - add += rg->from - last_accounted_offset; - if (!regions_needed) { - nrg = get_file_region_entry_from_cache( - resv, last_accounted_offset, rg->from); - record_hugetlb_cgroup_uncharge_info(h_cg, h, - resv, nrg); - list_add(&nrg->link, rg->link.prev); - coalesce_file_region(resv, nrg); - } else - *regions_needed += 1; - } + if (rg->from > last_accounted_offset) + add += hugetlb_resv_map_add(resv, rg, + last_accounted_offset, + rg->from, h, h_cg, + regions_needed); last_accounted_offset = rg->to; } @@ -394,17 +426,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, /* Handle the case where our range extends beyond * last_accounted_offset. */ - if (last_accounted_offset < t) { - add += t - last_accounted_offset; - if (!regions_needed) { - nrg = get_file_region_entry_from_cache( - resv, last_accounted_offset, t); - record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); - list_add(&nrg->link, rg->link.prev); - coalesce_file_region(resv, nrg); - } else - *regions_needed += 1; - } + if (last_accounted_offset < t) + add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, + t, h, h_cg, regions_needed); VM_BUG_ON(add < 0); return add; @@ -659,7 +683,7 @@ retry: del += t - f; hugetlb_cgroup_uncharge_file_region( - resv, rg, t - f); + resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; @@ -680,7 +704,7 @@ retry: if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, - rg->to - rg->from); + rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; @@ -688,13 +712,13 @@ retry: if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, - t - rg->from); + t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, - rg->to - f); + rg->to - f, false); del += rg->to - f; rg->to = f; @@ -3725,21 +3749,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) return false; } +static void +hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct page *new_page) +{ + __SetPageUptodate(new_page); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + hugepage_add_new_anon_rmap(new_page, vma, addr); + hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); + ClearHPageRestoreReserve(new_page); + SetHPageMigratable(new_page); +} + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; - int cow; + bool cow = is_cow_mapping(vma->vm_flags); struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long npages = pages_per_huge_page(h); struct address_space *mapping = vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, vma->vm_start, @@ -3784,6 +3819,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); dst_entry = huge_ptep_get(dst_pte); +again: if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { /* * Skip if src entry none. Also, skip in the @@ -3807,6 +3843,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { + entry = huge_ptep_get(src_pte); + ptepage = pte_page(entry); + get_page(ptepage); + + /* + * This is a rare case where we see pinned hugetlb + * pages while they're prone to COW. We need to do the + * COW earlier during fork. + * + * When pre-allocating the page or copying data, we + * need to be without the pgtable locks since we could + * sleep during the process. + */ + if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { + pte_t src_pte_old = entry; + struct page *new; + + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + /* Do not use reserve as it's private owned */ + new = alloc_huge_page(vma, addr, 1); + if (IS_ERR(new)) { + put_page(ptepage); + ret = PTR_ERR(new); + break; + } + copy_user_huge_page(new, ptepage, addr, vma, + npages); + put_page(ptepage); + + /* Install the new huge page if src pte stable */ + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + entry = huge_ptep_get(src_pte); + if (!pte_same(src_pte_old, entry)) { + put_page(new); + /* dst_entry won't change as in child */ + goto again; + } + hugetlb_install_page(vma, dst_pte, addr, new); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + continue; + } + if (cow) { /* * No need to notify as we are downgrading page @@ -3817,12 +3899,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ huge_ptep_set_wrprotect(src, addr, src_pte); } - entry = huge_ptep_get(src_pte); - ptepage = pte_page(entry); - get_page(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); - hugetlb_count_add(pages_per_huge_page(h), dst); + hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -5128,6 +5208,10 @@ bool hugetlb_reserve_pages(struct inode *inode, */ long rsv_adjust; + /* + * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the + * reference to h_cg->css. See comment below for detail. + */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); @@ -5135,6 +5219,14 @@ bool hugetlb_reserve_pages(struct inode *inode, rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); + } else if (h_cg) { + /* + * The file_regions will hold their own reference to + * h_cg->css. So we should release the reference held + * via hugetlb_cgroup_charge_cgroup_rsvd() when we are + * done. + */ + hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return true; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f68b51fcda3d..603a131e262d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -391,7 +391,8 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, - unsigned long nr_pages) + unsigned long nr_pages, + bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; @@ -400,7 +401,12 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); - css_put(rg->css); + /* + * Only do css_put(rg->css) when we delete the entire region + * because one file_region must hold exactly one css reference. + */ + if (region_del) + css_put(rg->css); } } diff --git a/mm/internal.h b/mm/internal.h index 9902648f2206..bbe900f9f095 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb, void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); -void force_page_cache_ra(struct readahead_control *, struct file_ra_state *, - unsigned long nr); +void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { - DEFINE_READAHEAD(ractl, file, mapping, index); - force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); + force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, @@ -97,6 +96,26 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +/* + * When kernel touch the user page, the user page may be have been marked + * poison but still mapped in user space, if without this page, the kernel + * can guarantee the data integrity and operation success, the kernel is + * better to check the posion status and avoid touching it, be good not to + * panic, coredump for process fatal signal is a sample case matching this + * scenario. Or if kernel can't guarantee the data integrity, it's better + * not to call this function, let kernel touch the poison page and get to + * panic. + */ +static inline bool is_page_poisoned(struct page *page) +{ + if (PageHWPoison(page)) + return true; + else if (PageHuge(page) && PageHWPoison(compound_head(page))) + return true; + + return false; +} + extern unsigned long highest_memmap_pfn; /* @@ -296,11 +315,6 @@ static inline unsigned int buddy_order(struct page *page) */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * These three helpers classifies VMAs for virtual memory accounting. */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b5e08d4cefec..7b53291dafa1 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -63,7 +63,7 @@ void __kasan_unpoison_range(const void *address, size_t size) kasan_unpoison(address, size); } -#if CONFIG_KASAN_STACK +#ifdef CONFIG_KASAN_STACK /* Unpoison the entire stack for a task. */ void kasan_unpoison_task_stack(struct task_struct *task) { diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2aad21fda156..4004388b4e4b 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -25,6 +25,12 @@ enum kasan_arg { KASAN_ARG_ON, }; +enum kasan_arg_mode { + KASAN_ARG_MODE_DEFAULT, + KASAN_ARG_MODE_SYNC, + KASAN_ARG_MODE_ASYNC, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -38,6 +44,7 @@ enum kasan_arg_fault { }; static enum kasan_arg kasan_arg __ro_after_init; +static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; static enum kasan_arg_fault kasan_arg_fault __ro_after_init; @@ -45,6 +52,10 @@ static enum kasan_arg_fault kasan_arg_fault __ro_after_init; DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); +/* Whether the asynchronous mode is enabled. */ +bool kasan_flag_async __ro_after_init; +EXPORT_SYMBOL_GPL(kasan_flag_async); + /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); @@ -68,6 +79,23 @@ static int __init early_kasan_flag(char *arg) } early_param("kasan", early_kasan_flag); +/* kasan.mode=sync/async */ +static int __init early_kasan_mode(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "sync")) + kasan_arg_mode = KASAN_ARG_MODE_SYNC; + else if (!strcmp(arg, "async")) + kasan_arg_mode = KASAN_ARG_MODE_ASYNC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.mode", early_kasan_mode); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -115,7 +143,15 @@ void kasan_init_hw_tags_cpu(void) return; hw_init_tags(KASAN_TAG_MAX); - hw_enable_tagging(); + + /* + * Enable async mode only when explicitly requested through + * the command line. + */ + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else + hw_enable_tagging_sync(); } /* kasan_init_hw_tags() is called once on boot CPU. */ @@ -132,6 +168,22 @@ void __init kasan_init_hw_tags(void) /* Enable KASAN. */ static_branch_enable(&kasan_flag_enabled); + switch (kasan_arg_mode) { + case KASAN_ARG_MODE_DEFAULT: + /* + * Default to sync mode. + * Do nothing, kasan_flag_async keeps its default value. + */ + break; + case KASAN_ARG_MODE_SYNC: + /* Do nothing, kasan_flag_async keeps its default value. */ + break; + case KASAN_ARG_MODE_ASYNC: + /* Async mode enabled. */ + kasan_flag_async = true; + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: /* Default to enabling stack trace collection. */ @@ -194,10 +246,16 @@ void kasan_set_tagging_report_once(bool state) } EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); -void kasan_enable_tagging(void) +void kasan_enable_tagging_sync(void) +{ + hw_enable_tagging_sync(); +} +EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync); + +void kasan_force_async_fault(void) { - hw_enable_tagging(); + hw_force_async_tag_fault(); } -EXPORT_SYMBOL_GPL(kasan_enable_tagging); +EXPORT_SYMBOL_GPL(kasan_force_async_fault); #endif diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8c55634d6edd..c1581e8a9b8e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,20 +7,37 @@ #include <linux/stackdepot.h> #ifdef CONFIG_KASAN_HW_TAGS + #include <linux/static_key.h> + DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +extern bool kasan_flag_async __ro_after_init; + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); } + +static inline bool kasan_async_mode_enabled(void) +{ + return kasan_flag_async; +} #else + static inline bool kasan_stack_collection_enabled(void) { return true; } + +static inline bool kasan_async_mode_enabled(void) +{ + return false; +} + #endif extern bool kasan_flag_panic __ro_after_init; +extern bool kasan_flag_async __ro_after_init; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -231,7 +248,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size); const char *kasan_get_bug_type(struct kasan_access_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); -#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } @@ -275,8 +292,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifdef CONFIG_KASAN_HW_TAGS -#ifndef arch_enable_tagging -#define arch_enable_tagging() +#ifndef arch_enable_tagging_sync +#define arch_enable_tagging_sync() +#endif +#ifndef arch_enable_tagging_async +#define arch_enable_tagging_async() #endif #ifndef arch_init_tags #define arch_init_tags(max_tag) @@ -284,6 +304,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_set_tagging_report_once #define arch_set_tagging_report_once(state) #endif +#ifndef arch_force_async_tag_fault +#define arch_force_async_tag_fault() +#endif #ifndef arch_get_random_tag #define arch_get_random_tag() (0xFF) #endif @@ -294,16 +317,19 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr)) #endif -#define hw_enable_tagging() arch_enable_tagging() +#define hw_enable_tagging_sync() arch_enable_tagging_sync() +#define hw_enable_tagging_async() arch_enable_tagging_async() #define hw_init_tags(max_tag) arch_init_tags(max_tag) #define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) +#define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) #define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) #else /* CONFIG_KASAN_HW_TAGS */ -#define hw_enable_tagging() +#define hw_enable_tagging_sync() +#define hw_enable_tagging_async() #define hw_set_tagging_report_once(state) #endif /* CONFIG_KASAN_HW_TAGS */ @@ -311,12 +337,14 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_set_tagging_report_once(bool state); -void kasan_enable_tagging(void); +void kasan_enable_tagging_sync(void); +void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ static inline void kasan_set_tagging_report_once(bool state) { } -static inline void kasan_enable_tagging(void) { } +static inline void kasan_enable_tagging_sync(void) { } +static inline void kasan_force_async_fault(void) { } #endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 87b271206163..14bd51ea2348 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -87,7 +87,8 @@ static void start_report(unsigned long *flags) static void end_report(unsigned long *flags, unsigned long addr) { - trace_error_report_end(ERROR_DETECTOR_KASAN, addr); + if (!kasan_async_mode_enabled()) + trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -360,6 +361,25 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags, (unsigned long)object); } +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_report_async(void) +{ + unsigned long flags; + +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + + start_report(&flags); + pr_err("BUG: KASAN: invalid-access\n"); + pr_err("Asynchronous mode enabled: no access details available\n"); + pr_err("\n"); + dump_stack(); + end_report(&flags, 0); +} +#endif /* CONFIG_KASAN_HW_TAGS */ + static void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 41f374585144..de732bc341c5 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -128,7 +128,7 @@ void kasan_metadata_fetch_row(char *buffer, void *row) memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } -#if CONFIG_KASAN_STACK +#ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, unsigned long *value) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 3b8ec938470a..d53c91f881a4 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -12,6 +12,7 @@ #include <linux/debugfs.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> +#include <linux/kmemleak.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/memblock.h> @@ -480,6 +481,14 @@ static bool __init kfence_init_pool(void) addr += 2 * PAGE_SIZE; } + /* + * The pool is live and will never be deallocated from this point on. + * Remove the pool object from the kmemleak object tree, as it would + * otherwise overlap with allocations returned by kfence_alloc(), which + * are registered with kmemleak through the slab post-alloc hook. + */ + kmemleak_free(__kfence_pool); + return true; err: diff --git a/mm/kfence/report.c b/mm/kfence/report.c index ab83d5a59bb1..e3f71451ad9e 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -20,6 +20,11 @@ #include "kfence.h" +/* May be overridden by <asm/kfence.h>. */ +#ifndef ARCH_FUNC_PREFIX +#define ARCH_FUNC_PREFIX "" +#endif + extern bool no_hash_pointers; /* Helper function to either print to a seq_file or to console. */ @@ -67,8 +72,9 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries for (skipnr = 0; skipnr < num_entries; skipnr++) { int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); - if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || - !strncmp(buf, "__slab_free", len)) { + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || + !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { /* * In case of tail calls from any of the below * to any of the above. @@ -77,10 +83,10 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries } /* Also the *_bulk() variants by only checking prefixes. */ - if (str_has_prefix(buf, "kfree") || - str_has_prefix(buf, "kmem_cache_free") || - str_has_prefix(buf, "__kmalloc") || - str_has_prefix(buf, "kmem_cache_alloc")) + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; } if (fallback < num_entries) @@ -116,12 +122,12 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met lockdep_assert_held(&meta->lock); if (meta->state == KFENCE_OBJECT_UNUSED) { - seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); + seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata); return; } seq_con_printf(seq, - "kfence-#%zd [0x%p-0x%p" + "kfence-#%td [0x%p-0x%p" ", size=%d, cache=%s] allocated by task %d:\n", meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid); @@ -204,7 +210,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n", get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? "left" : "right", object_index); @@ -213,14 +219,14 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r case KFENCE_ERROR_UAF: pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", + pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n", get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); pr_err("Corrupted memory at 0x%p ", (void *)address); print_diff_canary(address, 16, meta); - pr_cont(" (in kfence-#%zd):\n", object_index); + pr_cont(" (in kfence-#%td):\n", object_index); break; case KFENCE_ERROR_INVALID: pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), @@ -230,7 +236,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, + pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address, object_index); break; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c0014d3b91c1..fe6e3ae8e8c6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -97,6 +97,7 @@ #include <linux/atomic.h> #include <linux/kasan.h> +#include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/memory_hotplug.h> @@ -589,7 +590,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, atomic_set(&object->use_count, 1); object->flags = OBJECT_ALLOCATED; object->pointer = ptr; - object->size = size; + object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ diff --git a/mm/madvise.c b/mm/madvise.c index df692d2e35d4..01fef79ac761 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1198,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } - mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) { ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; goto release_task; } + /* + * Require CAP_SYS_NICE for influencing process performance. Note that + * only non-destructive hints are currently supported. + */ + if (!capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; + } + total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { @@ -1218,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, if (ret == 0) ret = total_len - iov_iter_count(&iter); +release_mm: mmput(mm); release_task: put_task_struct(task); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index b59054ef2e10..b890854ec761 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -165,10 +165,12 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, return 0; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD /* Huge pud */ walk->action = ACTION_CONTINUE; if (pud_trans_huge(pudval) || pud_devmap(pudval)) WARN_ON(pud_write(pudval) || pud_dirty(pudval)); +#endif return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 845eec01ef9d..e064ac0d850a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * Because page_memcg(head) is not set on compound tails, set it now. + * Because page_memcg(head) is not set on tails, set it now. */ -void mem_cgroup_split_huge_fixup(struct page *head) +void split_page_memcg(struct page *head, unsigned int nr) { struct mem_cgroup *memcg = page_memcg(head); int i; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < HPAGE_PMD_NR; i++) { - css_get(&memcg->css); - head[i].memcg_data = (unsigned long)memcg; - } + for (i = 1; i < nr; i++) + head[i].memcg_data = head->memcg_data; + css_get_many(&memcg->css, nr - 1); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP /** diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..550405fc3b5e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -166,7 +166,7 @@ static int __init init_zero_pfn(void) zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } -core_initcall(init_zero_pfn); +early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { @@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct page **prealloc, pte_t pte, struct page *page) { - struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; - if (!is_cow_mapping(src_vma->vm_flags)) - return 1; - /* * What we want to do is to check whether this page may * have been pinned by the parent process. If so, @@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * the page count. That might give false positives for * for pinning, but it will work correctly. */ - if (likely(!atomic_read(&src_mm->has_pinned))) - return 1; - if (likely(!page_maybe_dma_pinned(page))) + if (likely(!page_needs_cow_for_dma(src_vma, page))) return 1; new_page = *prealloc; @@ -3103,6 +3097,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_WP); } + /* + * Userfaultfd write-protect can defer flushes. Ensure the TLB + * is flushed in this case before copying. + */ + if (unlikely(userfaultfd_wp(vmf->vma) && + mm_tlb_flush_pending(vmf->vma->vm_mm))) + flush_tlb_page(vmf->vma, vmf->address); + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5ba51a8bdaeb..0cdbbfbc5757 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1072,7 +1072,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { - struct mhp_params params = { .pgprot = PAGE_KERNEL }; + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; u64 start, size; bool new_node = false; int ret; diff --git a/mm/mmap.c b/mm/mmap.c index 3f287599a7a3..1d96a21acb2f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -93,6 +93,12 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE (with Enhanced PAN supported): + * r: (no) no + * w: (no) no + * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 0dc7149b0c61..1b9837419bf9 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -249,16 +249,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb_flush_mmu_free(tlb); } -/** - * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down - * @tlb: the mmu_gather structure to initialize - * @mm: the mm_struct of the target address space - * @fullmm: @mm is without users and we're going to destroy the full address - * space (exit/execve) - * - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. - */ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { @@ -283,11 +273,30 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, inc_tlb_flush_pending(tlb->mm); } +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } +/** + * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * In this case, @mm is without users and we're going to destroy the + * full address space (exit/execve). + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 61ee40ed804e..459d195d2ff6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start( ""); WARN_ON(mmu_notifier_range_blockable(range) || _ret != -EAGAIN); + /* + * We call all the notifiers on any EAGAIN, + * there is no way for a notifier to know if + * its start method failed, thus a start that + * does EAGAIN can't also do end. + */ + WARN_ON(ops->invalidate_range_end); ret = _ret; } } } + + if (ret) { + /* + * Must be non-blocking to get here. If there are multiple + * notifiers and one or more failed start, any that succeeded + * start are expecting their end to be called. Do so now. + */ + hlist_for_each_entry_rcu(subscription, &subscriptions->list, + hlist, srcu_read_lock_held(&srcu)) { + if (!subscription->ops->invalidate_range_end) + continue; + + subscription->ops->invalidate_range_end(subscription, + range); + } + } srcu_read_unlock(&srcu, id); return ret; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9efaf430cfd3..fa1cf18bac97 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -170,7 +170,7 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/** +/* * Check whether unreclaimable slab amount is greater than * all user memory(LRU pages). * dump_unreclaimable_slab() could help in the case that diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eb34d204d4ee..9e35b636a393 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2833,6 +2833,22 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); +/* + * Wait for a page to complete writeback. Returns -EINTR if we get a + * fatal signal while waiting. + */ +int wait_on_page_writeback_killable(struct page *page) +{ + while (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + if (wait_on_page_bit_killable(page, PG_writeback)) + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e4b29ee2b1e..e2f19bf948db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -167,10 +167,10 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_FALSE(init_on_alloc); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); -DEFINE_STATIC_KEY_FALSE(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); static bool _init_on_alloc_enabled_early __read_mostly @@ -1282,6 +1282,12 @@ static __always_inline bool free_pages_prepare(struct page *page, kernel_poison_pages(page, 1 << order); /* + * With hardware tag-based KASAN, memory tags must be set before the + * page becomes unavailable via debug_pagealloc or arch_free_page. + */ + kasan_free_nondeferred_pages(page, order); + + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should * happen after this. @@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_pagealloc_unmap_pages(page, 1 << order); - kasan_free_nondeferred_pages(page, order); - return true; } @@ -3310,6 +3314,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -6259,12 +6264,65 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +/* + * Only struct pages that correspond to ranges defined by memblock.memory + * are zeroed and initialized by going through __init_single_page() during + * memmap_init_zone(). + * + * But, there could be struct pages that correspond to holes in + * memblock.memory. This can happen because of the following reasons: + * - physical memory bank size is not necessarily the exact multiple of the + * arbitrary section size + * - early reserved memory may not be listed in memblock.memory + * - memory layouts defined with memmap= kernel parameter may not align + * nicely with memmap sections + * + * Explicitly initialize those struct pages so that: + * - PG_Reserved is set + * - zone and node links point to zone and node that span the page if the + * hole is in the middle of a zone + * - zone and node links point to adjacent zone/node if the hole falls on + * the zone boundary; the pages in such holes will be prepended to the + * zone/node above the hole except for the trailing pages in the last + * section that will be appended to the zone/node below. + */ +static u64 __meminit init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + __init_single_page(pfn_to_page(pfn), pfn, zone, node); + __SetPageReserved(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} +#else +static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, + int zone, int node) +{ + return 0; +} +#endif + void __meminit __weak memmap_init_zone(struct zone *zone) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); + static unsigned long hole_pfn; unsigned long start_pfn, end_pfn; + u64 pgcnt = 0; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); @@ -6274,7 +6332,29 @@ void __meminit __weak memmap_init_zone(struct zone *zone) memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (hole_pfn < start_pfn) + pgcnt += init_unavailable_range(hole_pfn, start_pfn, + zone_id, nid); + hole_pfn = end_pfn; } + +#ifdef CONFIG_SPARSEMEM + /* + * Initialize the hole in the range [zone_end_pfn, section_end]. + * If zone boundary falls in the middle of a section, this hole + * will be re-initialized during the call to this function for the + * higher zone. + */ + end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); + if (hole_pfn < end_pfn) + pgcnt += init_unavailable_range(hole_pfn, end_pfn, + zone_id, nid); +#endif + + if (pgcnt) + pr_info(" %s zone: %llu pages in unavailable ranges\n", + zone->name, pgcnt); } static int zone_batchsize(struct zone *zone) @@ -7071,88 +7151,6 @@ void __init free_area_init_memoryless_node(int nid) free_area_init_node(nid); } -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) -/* - * Initialize all valid struct pages in the range [spfn, epfn) and mark them - * PageReserved(). Return the number of struct pages that were initialized. - */ -static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) -{ - unsigned long pfn; - u64 pgcnt = 0; - - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - /* - * Use a fake node/zone (0) for now. Some of these pages - * (in memblock.reserved but not in memblock.memory) will - * get re-initialized via reserve_bootmem_region() later. - */ - __init_single_page(pfn_to_page(pfn), pfn, 0, 0); - __SetPageReserved(pfn_to_page(pfn)); - pgcnt++; - } - - return pgcnt; -} - -/* - * Only struct pages that are backed by physical memory are zeroed and - * initialized by going through __init_single_page(). But, there are some - * struct pages which are reserved in memblock allocator and their fields - * may be accessed (for example page_to_pfn() on some configuration accesses - * flags). We must explicitly initialize those struct pages. - * - * This function also addresses a similar issue where struct pages are left - * uninitialized because the physical address range is not covered by - * memblock.memory or memblock.reserved. That could happen when memblock - * layout is manually configured via memmap=, or when the highest physical - * address (max_pfn) does not end on a section boundary. - */ -static void __init init_unavailable_mem(void) -{ - phys_addr_t start, end; - u64 i, pgcnt; - phys_addr_t next = 0; - - /* - * Loop through unavailable ranges not covered by memblock.memory. - */ - pgcnt = 0; - for_each_mem_range(i, &start, &end) { - if (next < start) - pgcnt += init_unavailable_range(PFN_DOWN(next), - PFN_UP(start)); - next = end; - } - - /* - * Early sections always have a fully populated memmap for the whole - * section - see pfn_valid(). If the last section has holes at the - * end and that section is marked "online", the memmap will be - * considered initialized. Make sure that memmap has a well defined - * state. - */ - pgcnt += init_unavailable_range(PFN_DOWN(next), - round_up(max_pfn, PAGES_PER_SECTION)); - - /* - * Struct pages that do not have backing memory. This could be because - * firmware is using some of this memory, or for some other reasons. - */ - if (pgcnt) - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); -} -#else -static inline void __init init_unavailable_mem(void) -{ -} -#endif /* !CONFIG_FLAT_NODE_MEM_MAP */ - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7576,7 +7574,6 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid); diff --git a/mm/page_poison.c b/mm/page_poison.c index 65cdf844c8ad..655dc5895604 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -77,12 +77,14 @@ static void unpoison_page(struct page *page) void *addr; addr = kmap_atomic(page); + kasan_disable_current(); /* * Page poisoning when enabled poisons each and every page * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned. */ - check_poison_mem(addr, PAGE_SIZE); + check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 18b768ac7dca..095d7eaa0db4 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -87,7 +87,7 @@ extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; -extern int pcpu_nr_empty_pop_pages; +extern int pcpu_nr_empty_pop_pages[]; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index c8400a2adbc2..f6026dbcdf6b 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -145,6 +145,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) int slot, max_nr_alloc; int *buffer; enum pcpu_chunk_type type; + int nr_empty_pop_pages; alloc_buffer: spin_lock_irq(&pcpu_lock); @@ -165,7 +166,11 @@ alloc_buffer: goto alloc_buffer; } -#define PL(X) \ + nr_empty_pop_pages = 0; + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) + nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type]; + +#define PL(X) \ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, @@ -196,7 +201,7 @@ alloc_buffer: PU(nr_max_chunks); PU(min_alloc_size); PU(max_alloc_size); - P("empty_pop_pages", pcpu_nr_empty_pop_pages); + P("empty_pop_pages", nr_empty_pop_pages); seq_putc(m, '\n'); #undef PU diff --git a/mm/percpu.c b/mm/percpu.c index 6596a0a4286e..23308113a5ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -173,10 +173,10 @@ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ static LIST_HEAD(pcpu_map_extend_chunks); /* - * The number of empty populated pages, protected by pcpu_lock. The - * reserved chunk doesn't contribute to the count. + * The number of empty populated pages by chunk type, protected by pcpu_lock. + * The reserved chunk doesn't contribute to the count. */ -int pcpu_nr_empty_pop_pages; +int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES]; /* * The number of populated pages in use by the allocator, protected by @@ -556,7 +556,7 @@ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr; } /* @@ -1832,7 +1832,7 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ @@ -2000,7 +2000,7 @@ retry_pop: pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - - pcpu_nr_empty_pop_pages, + pcpu_nr_empty_pop_pages[type], 0, PCPU_EMPTY_POP_PAGES_HIGH); } @@ -2580,7 +2580,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = chunk; - pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; + pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ diff --git a/mm/ptdump.c b/mm/ptdump.c index 4354c1422d57..da751448d0e4 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pte_t val = READ_ONCE(*pte); + pte_t val = ptep_get(pte); if (st->effective_prot) st->effective_prot(st, 4, pte_val(val)); diff --git a/mm/readahead.c b/mm/readahead.c index c5b0457415be..d589f147f4c2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -198,8 +198,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, for (i = 0; i < nr_to_read; i++) { struct page *page = xa_load(&mapping->i_pages, index + i); - BUG_ON(index + i != ractl->_index + ractl->_nr_pages); - if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch @@ -210,6 +208,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * not worth getting one just for that. */ read_pages(ractl, &page_pool, true); + i = ractl->_index + ractl->_nr_pages - index - 1; continue; } @@ -223,6 +222,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, gfp_mask) < 0) { put_page(page); read_pages(ractl, &page_pool, true); + i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) @@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl, * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned long nr_to_read) + unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; @@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping, * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, - struct file_ra_state *ra, bool hit_readahead_marker, - unsigned long req_size) + bool hit_readahead_marker, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); + struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; unsigned long index = readahead_index(ractl); @@ -550,7 +551,7 @@ readit: } void page_cache_sync_ra(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned long req_count) + unsigned long req_count) { bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); @@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, * read-ahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ - if (!ra->ra_pages || blk_cgroup_congested()) { + if (!ractl->ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; @@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl, /* be dumb */ if (do_forced_ra) { - force_page_cache_ra(ractl, ra, req_count); + force_page_cache_ra(ractl, req_count); return; } /* do read-ahead */ - ondemand_readahead(ractl, ra, false, req_count); + ondemand_readahead(ractl, false, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, - struct file_ra_state *ra, struct page *page, - unsigned long req_count) + struct page *page, unsigned long req_count) { /* no read-ahead */ - if (!ra->ra_pages) + if (!ractl->ra->ra_pages) return; /* @@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl, return; /* do read-ahead */ - ondemand_readahead(ractl, ra, true, req_count); + ondemand_readahead(ractl, true, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); @@ -638,3 +638,78 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } + +/** + * readahead_expand - Expand a readahead request + * @ractl: The request to be expanded + * @new_start: The revised start + * @new_len: The revised size of the request + * + * Attempt to expand a readahead request outwards from the current size to the + * specified size by inserting locked pages before and after the current window + * to increase the size to the new window. This may involve the insertion of + * THPs, in which case the window may get expanded even beyond what was + * requested. + * + * The algorithm will stop if it encounters a conflicting page already in the + * pagecache and leave a smaller expansion than requested. + * + * The caller must check for this by examining the revised @ractl object for a + * different expansion than was requested. + */ +void readahead_expand(struct readahead_control *ractl, + loff_t new_start, size_t new_len) +{ + struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; + pgoff_t new_index, new_nr_pages; + gfp_t gfp_mask = readahead_gfp_mask(mapping); + + new_index = new_start / PAGE_SIZE; + + /* Expand the leading edge downwards */ + while (ractl->_index > new_index) { + unsigned long index = ractl->_index - 1; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + + ractl->_nr_pages++; + ractl->_index = page->index; + } + + new_len += new_start - readahead_pos(ractl); + new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); + + /* Expand the trailing edge upwards */ + while (ractl->_nr_pages < new_nr_pages) { + unsigned long index = ractl->_index + ractl->_nr_pages; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + ractl->_nr_pages++; + if (ra) { + ra->size++; + ra->async_size++; + } + } +} +EXPORT_SYMBOL(readahead_expand); diff --git a/mm/shuffle.c b/mm/shuffle.c index 9c2e145a747a..c13c33b247e8 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -147,8 +147,8 @@ void __meminit __shuffle_zone(struct zone *z) spin_unlock_irqrestore(&z->lock, flags); } -/** - * shuffle_free_memory - reduce the predictability of the page allocator +/* + * __shuffle_free_memory - reduce the predictability of the page allocator * @pgdat: node page data */ void __meminit __shuffle_free_memory(pg_data_t *pgdat) diff --git a/mm/slab.c b/mm/slab.c index 51fd424e0d6d..ae651bf540b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - if (!objp) + if (!objp || is_kfence_address(objp)) return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); diff --git a/mm/slab.h b/mm/slab.h index 076582f58f68..774c7221efdc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -601,7 +601,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { - if (static_branch_unlikely(&init_on_alloc)) { + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) { if (c->ctor) return false; if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) @@ -613,7 +614,8 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) static inline bool slab_want_init_on_free(struct kmem_cache *c) { - if (static_branch_unlikely(&init_on_free)) + if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free)) return !(c->ctor || (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); return false; diff --git a/mm/slub.c b/mm/slub.c index e26c274b4657..7ed388077633 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -624,7 +624,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,8 +650,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, &page->flags); } @@ -706,7 +707,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -799,7 +800,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -1993,7 +1994,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) - continue; /* cmpxchg raced */ + break; available += objects; if (!object) { @@ -3898,7 +3899,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } diff --git a/mm/z3fold.c b/mm/z3fold.c index b5dafa7e44e4..9d889ad2bb86 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1346,8 +1346,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) page = list_entry(pos, struct page, lru); zhdr = page_address(page); - if (test_bit(PAGE_HEADLESS, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private)) { + /* + * For non-headless pages, we wait to do this + * until we have the page lock to avoid racing + * with __z3fold_alloc(). Headless pages don't + * have a lock (and __z3fold_alloc() will never + * see them), but we still need to test and set + * PAGE_CLAIMED to avoid racing with + * z3fold_free(), so just do it now before + * leaving the loop. + */ + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + continue; + break; + } if (kref_get_unless_zero(&zhdr->refcount) == 0) { zhdr = NULL; |