Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- | mm/memory-failure.c | 170 |
1 file changed, 102 insertions, 68 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fe121fdb05f7..4d6e43c88489 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,7 +39,6 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
-#include <linux/kernel-page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/dax.h>
@@ -50,7 +49,6 @@
 #include <linux/swap.h>
 #include <linux/backing-dev.h>
 #include <linux/migrate.h>
-#include <linux/suspend.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
@@ -59,7 +57,6 @@
 #include <linux/memremap.h>
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
-#include <linux/page-isolation.h>
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
 #include <linux/sysctl.h>
@@ -75,13 +72,15 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 static bool hw_memory_failure __read_mostly = false;
 
-inline void num_poisoned_pages_inc(unsigned long pfn)
+static DEFINE_MUTEX(mf_mutex);
+
+void num_poisoned_pages_inc(unsigned long pfn)
 {
 	atomic_long_inc(&num_poisoned_pages);
 	memblk_nr_poison_inc(pfn);
 }
 
-inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+void num_poisoned_pages_sub(unsigned long pfn, long i)
 {
 	atomic_long_sub(i, &num_poisoned_pages);
 	if (pfn != -1UL)
@@ -363,17 +362,14 @@
 void shake_page(struct page *p)
 {
 	if (PageHuge(p))
 		return;
-
-	if (!PageSlab(p)) {
-		lru_add_drain_all();
-		if (PageLRU(p) || is_free_buddy_page(p))
-			return;
-	}
-
 	/*
 	 * TODO: Could shrink slab caches here if a lightweight range-based
 	 * shrinker will be available.
 	 */
+	if (PageSlab(p))
+		return;
+
+	lru_add_drain_all();
 }
 EXPORT_SYMBOL_GPL(shake_page);
@@ -551,8 +547,8 @@
  * on behalf of the thread group. Return task_struct of the (first found)
  * dedicated thread if found, and return NULL otherwise.
  *
- * We already hold read_lock(&tasklist_lock) in the caller, so we don't
- * have to call rcu_read_lock/unlock() in this function.
+ * We already hold rcu lock in the caller, so we don't have to call
+ * rcu_read_lock/unlock() in this function.
  */
 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 {
@@ -613,8 +609,8 @@
 		return;
 
 	pgoff = page_to_pgoff(page);
-	read_lock(&tasklist_lock);
-	for_each_process (tsk) {
+	rcu_read_lock();
+	for_each_process(tsk) {
 		struct anon_vma_chain *vmac;
 		struct task_struct *t = task_early_kill(tsk, force_early);
@@ -630,7 +626,7 @@
 			add_to_kill_anon_file(t, page, vma, to_kill);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	anon_vma_unlock_read(av);
 }
 
@@ -646,7 +642,7 @@
 	pgoff_t pgoff;
 
 	i_mmap_lock_read(mapping);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	pgoff = page_to_pgoff(page);
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, force_early);
@@ -658,7 +654,7 @@
 		/*
 		 * Send early kill signal to tasks where a vma covers
 		 * the page but the corrupted page is not necessarily
-		 * mapped it in its pte.
+		 * mapped in its pte.
 		 * Assume applications who requested early kill want
 		 * to be informed of all such data corruptions.
 		 */
@@ -666,7 +662,7 @@
 			add_to_kill_anon_file(t, page, vma, to_kill);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
@@ -689,7 +685,7 @@
 	struct task_struct *tsk;
 
 	i_mmap_lock_read(mapping);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, true);
 
@@ -700,7 +696,7 @@
 			add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
 #endif /* CONFIG_FS_DAX */
@@ -721,7 +717,7 @@
 		collect_procs_file(page, tokill, force_early);
 }
 
-struct hwp_walk {
+struct hwpoison_walk {
 	struct to_kill tk;
 	unsigned long pfn;
 	int flags;
@@ -756,7 +752,7 @@
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
-				      struct hwp_walk *hwp)
+				      struct hwpoison_walk *hwp)
 {
 	pmd_t pmd = *pmdp;
 	unsigned long pfn;
@@ -774,7 +770,7 @@
 }
 #else
 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
-				      struct hwp_walk *hwp)
+				      struct hwpoison_walk *hwp)
 {
 	return 0;
 }
@@ -783,7 +779,7 @@
 static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
 			      unsigned long end, struct mm_walk *walk)
 {
-	struct hwp_walk *hwp = walk->private;
+	struct hwpoison_walk *hwp = walk->private;
 	int ret = 0;
 	pte_t *ptep, *mapped_pte;
 	spinlock_t *ptl;
@@ -817,7 +813,7 @@
 static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 				  unsigned long addr, unsigned long end,
 				  struct mm_walk *walk)
 {
-	struct hwp_walk *hwp = walk->private;
+	struct hwpoison_walk *hwp = walk->private;
 	pte_t pte = huge_ptep_get(ptep);
 	struct hstate *h = hstate_vma(walk->vma);
@@ -828,7 +824,7 @@
 #define hwpoison_hugetlb_range	NULL
 #endif
 
-static const struct mm_walk_ops hwp_walk_ops = {
+static const struct mm_walk_ops hwpoison_walk_ops = {
 	.pmd_entry = hwpoison_pte_range,
 	.hugetlb_entry = hwpoison_hugetlb_range,
 	.walk_lock = PGWALK_RDLOCK,
@@ -851,7 +847,7 @@
 				  int flags)
 {
 	int ret;
-	struct hwp_walk priv = {
+	struct hwpoison_walk priv = {
 		.pfn = pfn,
 	};
 	priv.tk.tsk = p;
@@ -860,7 +856,7 @@
 		return -EFAULT;
 
 	mmap_read_lock(p->mm);
-	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
 			      (void *)&priv);
 	if (ret == 1 && priv.tk.addr)
 		kill_proc(&priv.tk, pfn, flags);
@@ -940,14 +936,12 @@
 		struct folio *folio = page_folio(p);
 		int err = mapping->a_ops->error_remove_page(mapping, p);
 
-		if (err != 0) {
+		if (err != 0)
 			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
-		} else if (folio_has_private(folio) &&
-			   !filemap_release_folio(folio, GFP_NOIO)) {
+		else if (!filemap_release_folio(folio, GFP_NOIO))
 			pr_info("%#lx: failed to release buffers\n", pfn);
-		} else {
+		else
 			ret = MF_RECOVERED;
-		}
 	} else {
 		/*
 		 * If the file system doesn't support it just invalidate
@@ -1193,9 +1187,6 @@
 	struct address_space *mapping;
 	bool extra_pins = false;
 
-	if (!PageHuge(hpage))
-		return MF_DELAYED;
-
 	mapping = page_mapping(hpage);
 	if (mapping) {
 		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
@@ -1395,8 +1386,15 @@
 	bool hugetlb = false;
 
 	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
-	if (hugetlb)
-		return ret;
+	if (hugetlb) {
+		/* Make sure hugetlb demotion did not happen from under us. */
+		if (folio == page_folio(page))
+			return ret;
+		if (ret > 0) {
+			folio_put(folio);
+			folio = page_folio(page);
+		}
+	}
 
 	/*
 	 * This check prevents from calling folio_try_get() for any
@@ -1485,8 +1483,13 @@
 	bool hugetlb = false;
 
 	ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
-	if (hugetlb)
-		return ret;
+	if (hugetlb) {
+		/* Make sure hugetlb demotion did not happen from under us. */
+		if (folio == page_folio(page))
+			return ret;
+		if (ret > 0)
+			folio_put(folio);
+	}
 
 	/*
 	 * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
@@ -1559,7 +1562,7 @@
 	 * Here we are interested only in user-mapped pages, so skip any
 	 * other types of pages.
 	 */
-	if (PageReserved(p) || PageSlab(p) || PageTable(p))
+	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
 		return true;
 	if (!(PageLRU(hpage) || PageHuge(p)))
 		return true;
@@ -1814,6 +1817,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
 #endif /* CONFIG_FS_DAX */
 
 #ifdef CONFIG_HUGETLB_PAGE
+
 /*
  * Struct raw_hwp_page represents information about "raw error page",
  * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
@@ -1828,16 +1832,49 @@
 	return (struct llist_head *)&folio->_hugetlb_hwpoison;
 }
 
+bool is_raw_hwpoison_page_in_hugepage(struct page *page)
+{
+	struct llist_head *raw_hwp_head;
+	struct raw_hwp_page *p;
+	struct folio *folio = page_folio(page);
+	bool ret = false;
+
+	if (!folio_test_hwpoison(folio))
+		return false;
+
+	if (!folio_test_hugetlb(folio))
+		return PageHWPoison(page);
+
+	/*
+	 * When RawHwpUnreliable is set, kernel lost track of which subpages
+	 * are HWPOISON. So return as if ALL subpages are HWPOISONed.
+	 */
+	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+		return true;
+
+	mutex_lock(&mf_mutex);
+
+	raw_hwp_head = raw_hwp_list_head(folio);
+	llist_for_each_entry(p, raw_hwp_head->first, node) {
+		if (page == p->page) {
+			ret = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&mf_mutex);
+
+	return ret;
+}
+
 static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
 {
-	struct llist_head *head;
-	struct llist_node *t, *tnode;
+	struct llist_node *head;
+	struct raw_hwp_page *p, *next;
 	unsigned long count = 0;
 
-	head = raw_hwp_list_head(folio);
-	llist_for_each_safe(tnode, t, head->first) {
-		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
-
+	head = llist_del_all(raw_hwp_list_head(folio));
+	llist_for_each_entry_safe(p, next, head, node) {
 		if (move_flag)
 			SetPageHWPoison(p->page);
 		else
@@ -1845,7 +1882,6 @@
 		kfree(p);
 		count++;
 	}
-	llist_del_all(head);
 	return count;
 }
 
@@ -1853,7 +1889,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 {
 	struct llist_head *head;
 	struct raw_hwp_page *raw_hwp;
-	struct llist_node *t, *tnode;
+	struct raw_hwp_page *p, *next;
 	int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
 
 	/*
@@ -1864,9 +1900,7 @@
 	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
 		return -EHWPOISON;
 	head = raw_hwp_list_head(folio);
-	llist_for_each_safe(tnode, t, head->first) {
-		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
-
+	llist_for_each_entry_safe(p, next, head->first, node) {
 		if (p->page == page)
 			return -EHWPOISON;
 	}
@@ -1917,6 +1951,8 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
 {
 	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
 		return;
+	if (folio_test_hugetlb_vmemmap_optimized(folio))
+		return;
 	folio_clear_hwpoison(folio);
 	folio_free_raw_hwp(folio, true);
 }
@@ -2081,8 +2117,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 {
 	int rc = -ENXIO;
 
-	put_ref_page(pfn, flags);
-
 	/* device metadata space is not recoverable */
 	if (!pgmap_pfn_valid(pgmap, pfn))
 		goto out;
@@ -2105,12 +2139,11 @@
 out:
 	/* drop pgmap ref acquired in caller */
 	put_dev_pagemap(pgmap);
-	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+	if (rc != -EOPNOTSUPP)
+		action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
 	return rc;
 }
 
-static DEFINE_MUTEX(mf_mutex);
-
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -2126,7 +2159,7 @@
  * detected by a background scrubber)
  *
  * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
+ * enabled and no spinlocks held.
  *
 * Return: 0 for successfully handled the memory error,
 *         -EOPNOTSUPP for hwpoison_filter() filtered the error event,
@@ -2158,6 +2191,7 @@
 
 	if (pfn_valid(pfn)) {
 		pgmap = get_dev_pagemap(pfn, NULL);
+		put_ref_page(pfn, flags);
 		if (pgmap) {
 			res = memory_failure_dev_pagemap(pfn, flags,
 							 pgmap);
@@ -2184,8 +2218,6 @@
 		goto unlock_mutex;
 	}
 
-	hpage = compound_head(p);
-
 	/*
 	 * We need/can do nothing about count=0 pages.
 	 * 1) it's a free page, and therefore in safe hand:
@@ -2224,13 +2256,14 @@
 		}
 	}
 
+	hpage = compound_head(p);
 	if (PageTransHuge(hpage)) {
 		/*
 		 * The flag must be set after the refcount is bumped
 		 * otherwise it may race with THP split.
 		 * And the flag can't be set in get_hwpoison_page() since
 		 * it is called by soft offline too and it is just called
-		 * for !MF_COUNT_INCREASE.  So here seems to be the best
+		 * for !MF_COUNT_INCREASED.  So here seems to be the best
 		 * place.
 		 *
 		 * Don't need care about the above error handling paths for
@@ -2500,7 +2533,8 @@
 		goto unlock_mutex;
 	}
 
-	if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+	if (folio_test_slab(folio) || PageTable(&folio->page) ||
+	    folio_test_reserved(folio) || PageOffline(&folio->page))
 		goto unlock_mutex;
 
 	/*
@@ -2590,10 +2624,10 @@
 
 	/*
 	 * If we succeed to isolate the page, we grabbed another refcount on
-	 * the page, so we can safely drop the one we got from get_any_pages().
+	 * the page, so we can safely drop the one we got from get_any_page().
 	 * If we failed to isolate the page, it means that we cannot go further
 	 * and we will return an error, so drop the reference we got from
-	 * get_any_pages() as well.
+	 * get_any_page() as well.
 	 */
 	put_page(page);
 	return isolated;
@@ -2626,7 +2660,7 @@
 	}
 
 	lock_page(page);
-	if (!PageHuge(page))
+	if (!huge)
 		wait_on_page_writeback(page);
 	if (PageHWPoison(page)) {
 		unlock_page(page);
@@ -2635,7 +2669,7 @@
 		return 0;
 	}
 
-	if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
+	if (!huge && PageLRU(page) && !PageSwapCache(page))
 		/*
 		 * Try to invalidate first. This should work for
 		 * non dirty unmapped page cache pages.
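
A note on the tasklist_lock -> RCU conversion in the collect_procs_*() hunks above: for_each_process() may run under rcu_read_lock() because the task list is RCU-protected, so these readers no longer serialize against fork()/exit() on tasklist_lock. A minimal out-of-tree sketch of the pattern (the helper name walk_tasks_rcu and its callback are illustrative, not part of the patch):

#include <linux/rcupdate.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>

/* Sketch only: iterate all processes under RCU instead of tasklist_lock. */
static void walk_tasks_rcu(void (*fn)(struct task_struct *tsk, void *arg),
			   void *arg)
{
	struct task_struct *tsk;

	rcu_read_lock();
	for_each_process(tsk) {
		/*
		 * Must not sleep inside the RCU section; take a reference
		 * with get_task_struct() if tsk is used after the loop.
		 */
		fn(tsk, arg);
	}
	rcu_read_unlock();
}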
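The __folio_free_raw_hwp() hunk replaces llist_for_each_safe() plus an open-coded container_of() with llist_for_each_entry_safe(), and detaches the list via llist_del_all() before walking it, so the traversal runs on a private snapshot. A standalone sketch of the same idiom (struct raw_hwp_page is mirrored here for illustration; in the kernel it is private to mm/memory-failure.c):

#include <linux/llist.h>
#include <linux/mm_types.h>
#include <linux/slab.h>

struct raw_hwp_page {
	struct llist_node node;
	struct page *page;
};

static unsigned long free_raw_hwp_list(struct llist_head *list)
{
	struct raw_hwp_page *p, *next;
	struct llist_node *head;
	unsigned long count = 0;

	/* Atomically detach the whole list; no further locking needed. */
	head = llist_del_all(list);
	llist_for_each_entry_safe(p, next, head, node) {
		kfree(p);	/* _safe variant tolerates freeing the current entry */
		count++;
	}
	return count;
}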
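The get_hwpoison_hugetlb_folio() hunks add a recheck because a concurrent hugetlb demotion can split the folio between the page_folio() lookup and the refcount bump, leaving the caller holding a reference on a stale head page. A simplified sketch of that recheck pattern (the helper name is illustrative; the real __get_hwpoison_page() retries against the new head rather than failing):

#include <linux/mm.h>

static struct folio *get_stable_folio(struct page *page)
{
	struct folio *folio = page_folio(page);

	if (!folio_try_get(folio))
		return NULL;
	/* Did a demotion/split change the page's folio under us? */
	if (folio != page_folio(page)) {
		folio_put(folio);	/* we pinned a stale head */
		return NULL;
	}
	return folio;
}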