diff options
Diffstat (limited to 'mm/memory-failure.c')
| -rw-r--r-- | mm/memory-failure.c | 345 |
1 files changed, 212 insertions, 133 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f1c389f7e669..dcb6bb9cf731 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include <linux/ratelimit.h> #include <linux/page-isolation.h> #include <linux/pagewalk.h> +#include <linux/shmem_fs.h> #include "internal.h" #include "ras/ras_event.h" @@ -129,12 +130,6 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - /* - * page_mapping() does not accept slab pages. - */ - if (PageSlab(p)) - return -EINVAL; - mapping = page_mapping(p); if (mapping == NULL || mapping->host == NULL) return -EINVAL; @@ -257,16 +252,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); - if (flags & MF_ACTION_REQUIRED) { - if (t == current) - ret = force_sig_mceerr(BUS_MCEERR_AR, - (void __user *)tk->addr, addr_lsb); - else - /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */ - ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, - addr_lsb, t); - } else { + if ((flags & MF_ACTION_REQUIRED) && (t == current)) + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); + else /* + * Signal other processes sharing the page if they have + * PF_MCE_EARLY set. * Don't use force here, it's convenient if the signal * can be temporarily blocked. * This could cause a loop when the user sets SIGBUS @@ -274,7 +266,6 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) */ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); /* synchronous? */ - } if (ret < 0) pr_info("Memory failure: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); @@ -314,6 +305,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, pmd_t *pmd; pte_t *pte; + VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return 0; @@ -486,12 +478,13 @@ static struct task_struct *task_early_kill(struct task_struct *tsk, static void collect_procs_anon(struct page *page, struct list_head *to_kill, int force_early) { + struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma_read(page); + av = folio_lock_anon_vma_read(folio); if (av == NULL) /* Not actually mapped anymore */ return; @@ -706,8 +699,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { @@ -722,7 +717,6 @@ static const char * const action_page_types[] = { [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@ -737,9 +731,9 @@ static const char * const action_page_types[] = { [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", + [MF_MSG_DIFFERENT_PAGE_SIZE] = "different page size", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -867,6 +861,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -896,17 +891,23 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) } /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } @@ -1154,18 +1155,40 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +static inline bool PageHWPoisonTakenOff(struct page *page) +{ + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; +} + +void SetPageHWPoisonTakenOff(struct page *page) +{ + set_page_private(page, MAGIC_HWPOISON); +} + +void ClearPageHWPoisonTakenOff(struct page *page) +{ + if (PageHWPoison(page)) + set_page_private(page, 0); +} + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function * does not return true for hugetlb or device memory pages, so it's assumed * to be called only in the context where we never have such pages. */ -static inline bool HWPoisonHandlable(struct page *page) +static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) { - return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page); + bool movable = false; + + /* Soft offline could mirgate non-LRU movable pages */ + if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page)) + movable = true; + + return movable || PageLRU(page) || is_free_buddy_page(page); } -static int __get_hwpoison_page(struct page *page) +static int __get_hwpoison_page(struct page *page, unsigned long flags) { struct page *head = compound_head(page); int ret = 0; @@ -1180,7 +1203,7 @@ static int __get_hwpoison_page(struct page *page) * for any unsupported type of page in order to reduce the risk of * unexpected races caused by taking a page refcount. */ - if (!HWPoisonHandlable(head)) + if (!HWPoisonHandlable(head, flags)) return -EBUSY; if (get_page_unless_zero(head)) { @@ -1205,7 +1228,7 @@ static int get_any_page(struct page *p, unsigned long flags) try_again: if (!count_increased) { - ret = __get_hwpoison_page(p); + ret = __get_hwpoison_page(p, flags); if (!ret) { if (page_count(p)) { /* We raced with an allocation, retry. */ @@ -1233,7 +1256,7 @@ try_again: } } - if (PageHuge(p) || HWPoisonHandlable(p)) { + if (PageHuge(p) || HWPoisonHandlable(p, flags)) { ret = 1; } else { /* @@ -1256,6 +1279,27 @@ out: return ret; } +static int __get_unpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; +} + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@ -1263,7 +1307,7 @@ out: * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, @@ -1272,18 +1316,26 @@ out: * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1296,6 +1348,7 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page *hpage) { + struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; struct address_space *mapping; LIST_HEAD(tokill); @@ -1360,26 +1413,22 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - if (!PageHuge(hpage)) { - try_to_unmap(hpage, ttu); + if (PageHuge(hpage) && !PageAnon(hpage)) { + /* + * For hugetlb pages in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_page_mapping_lock_write(hpage); + if (mapping) { + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else + pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); } else { - if (!PageAnon(hpage)) { - /* - * For hugetlb pages in shared mappings, try_to_unmap - * could potentially call huge_pmd_unshare. Because of - * this, take semaphore in write mode here and set - * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higher level. - */ - mapping = hugetlb_page_mapping_lock_write(hpage); - if (mapping) { - try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); - } else { - try_to_unmap(hpage, ttu); - } + try_to_unmap(folio, ttu); } unmap_success = !page_mapped(hpage); @@ -1475,7 +1524,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestClearPageHWPoison(head)) num_poisoned_pages_dec(); unlock_page(head); - return 0; + return -EOPNOTSUPP; } unlock_page(head); res = MF_FAILED; @@ -1492,14 +1541,25 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) } lock_page(head); + + /* + * The page could have changed compound pages due to race window. + * If this happens just bail out. + */ + if (!PageHuge(p) || compound_head(p) != head) { + action_result(pfn, MF_MSG_DIFFERENT_PAGE_SIZE, MF_IGNORED); + res = -EBUSY; + goto out; + } + page_flags = head->flags; - if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(head)) + num_poisoned_pages_dec(); + put_page(p); + res = -EOPNOTSUPP; + goto out; } /* @@ -1553,6 +1613,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, } /* + * Pages instantiated by device-dax (not filesystem-dax) + * may be compound pages. + */ + page = compound_head(page); + + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by * lock_page(), but dax pages do not use the page lock. This @@ -1564,7 +1630,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto out; if (hwpoison_filter(page)) { - rc = 0; + rc = -EOPNOTSUPP; goto unlock; } @@ -1589,7 +1655,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(page, &tokill, true); list_for_each_entry(tk, &tokill, nd) if (tk->size_shift) @@ -1604,7 +1670,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, start = (page->index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(page->mapping, start, size, 0); } - kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags); + kill_procs(&tokill, true, false, pfn, flags); rc = 0; unlock: dax_unlock_page(page, cookie); @@ -1615,6 +1681,8 @@ out: return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1631,17 +1699,19 @@ out: * * Must run in process context (e.g. a work queue) with interrupts * enabled and no spinlocks hold. + * + * Return: 0 for successfully handled the memory error, + * -EOPNOTSUPP for memory_filter() filtered the error event, + * < 0(except -EOPNOTSUPP) on failure. */ int memory_failure(unsigned long pfn, int flags) { struct page *p; struct page *hpage; - struct page *orig_head; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1683,7 +1753,7 @@ try_again: goto unlock_mutex; } - orig_head = hpage = compound_head(p); + hpage = compound_head(p); num_poisoned_pages_inc(); /* @@ -1764,10 +1834,21 @@ try_again: lock_page(p); /* - * The page could have changed compound pages during the locking. - * If this happens just bail out. + * We're only intended to deal with the non-Compound page here. + * However, the page could have changed compound pages due to + * race window. If this happens, we could try again to hopefully + * handle the page next round. */ - if (PageCompound(p) && compound_head(p) != orig_head) { + if (PageCompound(p)) { + if (retry) { + if (TestClearPageHWPoison(p)) + num_poisoned_pages_dec(); + unlock_page(p); + put_page(p); + flags &= ~MF_COUNT_INCREASED; + retry = false; + goto try_again; + } action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto unlock_page; @@ -1782,21 +1863,12 @@ try_again: */ page_flags = p->flags; - /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); put_page(p); + res = -EOPNOTSUPP; goto unlock_mutex; } @@ -1805,7 +1877,7 @@ try_again: * page_lock. We need wait writeback completion for this page or it * may trigger vfs BUG while evict inode. */ - if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) + if (!PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* @@ -1955,6 +2027,28 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) +static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) +{ + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; +} + +static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) +{ + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; +} + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1971,8 +2065,7 @@ int unpoison_memory(unsigned long pfn) { struct page *page; struct page *p; - int freeit = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1982,69 +2075,60 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } - if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); - return 0; - } + if (PageSlab(page) || PageTable(page)) + goto unlock_mutex; - lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } - unlock_page(page); + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p); - put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + } - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2087,7 +2171,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) */ static int __soft_offline_page(struct page *page) { - int ret = 0; + long ret = 0; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); char const *msg_page[] = {"page", "hugepage"}; @@ -2098,12 +2182,6 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - /* - * Check PageHWPoison again inside page lock because PageHWPoison - * is set by memory_failure() outside page lock. Note that - * memory_failure() also double-checks PageHWPoison inside page lock, - * so there's no race between soft_offline_page() and memory_failure(). - */ lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); @@ -2114,7 +2192,7 @@ static int __soft_offline_page(struct page *page) return 0; } - if (!PageHuge(page)) + if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page)) /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. @@ -2122,10 +2200,6 @@ static int __soft_offline_page(struct page *page) ret = invalidate_inode_page(page); unlock_page(page); - /* - * RED-PEN would be better to keep it isolated here, but we - * would need to fix isolation locking first. - */ if (ret) { pr_info("soft_offline: %#lx: invalidated\n", pfn); page_handle_poison(page, false, true); @@ -2144,7 +2218,7 @@ static int __soft_offline_page(struct page *page) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n", + pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n", pfn, msg_page[huge], ret, &page->flags); if (ret > 0) ret = -EBUSY; @@ -2225,15 +2299,18 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; } retry: get_online_mems(); - ret = get_hwpoison_page(page, flags); + ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE); put_online_mems(); if (ret > 0) { @@ -2246,5 +2323,7 @@ retry: } } + mutex_unlock(&mf_mutex); + return ret; } |