aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Hildenbrand <[email protected]>2022-03-24 18:13:40 -0700
committerLinus Torvalds <[email protected]>2022-03-24 19:06:50 -0700
commitc145e0b47c77ebeefdfd73dbb344577b2fc9b065 (patch)
tree9524a40d5301a81867a5ef46025476bbc2e0b5b1
parent84d60fdd3733fb86c126f2adfd0361fdc44087c3 (diff)
mm: streamline COW logic in do_swap_page()
Currently we have a different COW logic when:
* triggering a read-fault to swapin first and then triggering a write-fault
  -> do_swap_page() + do_wp_page()
* triggering a write-fault to swapin
  -> do_swap_page() + do_wp_page() only if we fail reuse in do_swap_page()

The COW logic in do_swap_page() is different than our reuse logic in do_wp_page(). The COW logic in do_wp_page() -- page_count() == 1 -- currently makes sure that we certainly don't have a remaining reference, e.g., via GUP, on the target page we want to reuse: if there is any unexpected reference, we have to copy to avoid information leaks.

As do_swap_page() behaves differently, in environments with swap enabled we can currently have an unintended information leak from the parent to the child, similar to the one known from CVE-2020-29374:

  1. Parent writes to anonymous page
     -> Page is mapped writable and modified
  2. Page is swapped out
     -> Page is unmapped and replaced by swap entry
  3. fork()
     -> Swap entries are copied to child
  4. Child pins page R/O
     -> Page is mapped R/O into child
  5. Child unmaps page
     -> Child still holds GUP reference
  6. Parent writes to page
     -> Page is reused in do_swap_page()
     -> Child can observe changes

Exchanging 2. and 3. should have the same effect.

Let's apply the same COW logic as in do_wp_page(), conditionally trying to remove the page from the swapcache after freeing the swap entry, however, before actually mapping our page. We can change the order now that we use try_to_free_swap(), which doesn't care about the mapcount, instead of reuse_swap_page().

To handle references from the LRU pagevecs, conditionally drain the local LRU pagevecs when required, however, don't consider the page_count() when deciding whether to drain to keep it simple for now.
Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: David Hildenbrand <[email protected]> Acked-by: Vlastimil Babka <[email protected]> Cc: Andrea Arcangeli <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: David Rientjes <[email protected]> Cc: Don Dutile <[email protected]> Cc: Hugh Dickins <[email protected]> Cc: Jan Kara <[email protected]> Cc: Jann Horn <[email protected]> Cc: Jason Gunthorpe <[email protected]> Cc: John Hubbard <[email protected]> Cc: Kirill A. Shutemov <[email protected]> Cc: Liang Zhang <[email protected]> Cc: Matthew Wilcox (Oracle) <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Mike Kravetz <[email protected]> Cc: Mike Rapoport <[email protected]> Cc: Nadav Amit <[email protected]> Cc: Oleg Nesterov <[email protected]> Cc: Peter Xu <[email protected]> Cc: Rik van Riel <[email protected]> Cc: Roman Gushchin <[email protected]> Cc: Shakeel Butt <[email protected]> Cc: Yang Shi <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
-rw-r--r--mm/memory.c55
1 files changed, 43 insertions, 12 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 8e30675dc077..f721735ff947 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3489,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
+static inline bool should_try_to_free_swap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ if (!PageSwapCache(page))
+ return false;
+ if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
+ PageMlocked(page))
+ return true;
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * user. Try freeing the swapcache to get rid of the swapcache
+ * reference only in case it's likely that we'll be the exclusive user.
+ */
+ return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ page_count(page) == 2;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3630,6 +3649,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = swapcache;
goto out_page;
}
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * pagevecs if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
+ !PageKsm(page) && !PageLRU(page))
+ lru_add_drain();
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3648,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
- * The page isn't present yet, go ahead with the fault.
- *
- * Be careful about the sequence of operations here.
- * To get its accounting right, reuse_swap_page() must be called
- * while the page is counted on swap but not yet in mapcount i.e.
- * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
- * must be called after the swap_free(), or it will never succeed.
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * We're already holding a reference on the page but haven't mapped it
+ * yet.
*/
+ swap_free(entry);
+ if (should_try_to_free_swap(page, vma, vmf->flags))
+ try_to_free_swap(page);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+
+ /*
+ * Same logic as in do_wp_page(); however, optimize for fresh pages
+ * that are certainly not shared because we just allocated them without
+ * exposing them to the swapcache.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ (page != swapcache || page_count(page) == 1)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -3686,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- swap_free(entry);
- if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*