diff options
Diffstat (limited to 'mm/migrate.c')
-rw-r--r-- | mm/migrate.c | 364 |
1 files changed, 196 insertions, 168 deletions
diff --git a/mm/migrate.c b/mm/migrate.c index 84381b55b2bd..ac6f4939bb59 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) /* * Check PageMovable before holding a PG_lock because page's owner * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grapping the lock ruins page's owner side. + * so unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; @@ -326,17 +326,14 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count - * *must* be zero. And, we don't want to call wait_on_page_locked() - * against a page without get_page(). - * So, we use get_page_unless_zero(), here. Even failed, page fault - * will occur again. + * Once page cache replacement of page migration started, page_count + * is zero; but we must not call put_and_wait_on_page_locked() without + * a ref. Use get_page_unless_zero(), and just fault again if it fails. */ if (!get_page_unless_zero(page)) goto out; pte_unmap_unlock(ptep, ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); return; out: pte_unmap_unlock(ptep, ptl); @@ -370,63 +367,28 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); - wait_on_page_locked(page); - put_page(page); + put_and_wait_on_page_locked(page); return; unlock: spin_unlock(ptl); } #endif -#ifdef CONFIG_BLOCK -/* Returns true if all buffers are successfully locked */ -static bool buffer_migrate_lock_buffers(struct buffer_head *head, - enum migrate_mode mode) +static int expected_page_refs(struct address_space *mapping, struct page *page) { - struct buffer_head *bh = head; - - /* Simple case, sync compaction */ - if (mode != MIGRATE_ASYNC) { - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + int expected_count = 1; - return true; - } - - /* async case, we cannot block on lock_buffer so use trylock_buffer */ - do { - get_bh(bh); - if (!trylock_buffer(bh)) { - /* - * We failed to lock the buffer and cannot stall in - * async migration. Release the taken locks - */ - struct buffer_head *failed_bh = bh; - put_bh(failed_bh); - bh = head; - while (bh != failed_bh) { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - } - return false; - } + /* + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. + */ + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); + if (mapping) + expected_count += hpage_nr_pages(page) + page_has_private(page); - bh = bh->b_this_page; - } while (bh != head); - return true; -} -#else -static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, - enum migrate_mode mode) -{ - return true; + return expected_count; } -#endif /* CONFIG_BLOCK */ /* * Replace the page in the mapping. @@ -437,21 +399,13 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode, + struct page *newpage, struct page *page, enum migrate_mode mode, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; - int expected_count = 1 + extra_count; - void **pslot; - - /* - * Device public or private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); - expected_count += is_device_public_page(page); + int expected_count = expected_page_refs(mapping, page) + extra_count; if (!mapping) { /* Anonymous page without mapping */ @@ -470,35 +424,14 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); - - expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); - return -EAGAIN; - } - - /* - * In the async migration case of moving a page with buffers, lock the - * buffers using trylock before the mapping is moved. If the mapping - * was moved, we later failed to lock the buffers and could not move - * the mapping back due to an elevated page count, we would have to - * block waiting on other references to be dropped. - */ - if (mode == MIGRATE_ASYNC && head && - !buffer_migrate_lock_buffers(head, mode)) { - page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -526,16 +459,13 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + xas_next(&xas); + xas_store(&xas, newpage + i); } } @@ -546,7 +476,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -586,22 +516,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -610,11 +536,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } @@ -685,6 +611,8 @@ void migrate_page_states(struct page *newpage, struct page *page) SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); + if (PageWorkingset(page)) + SetPageWorkingset(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -758,7 +686,7 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -772,34 +700,94 @@ int migrate_page(struct address_space *mapping, EXPORT_SYMBOL(migrate_page); #ifdef CONFIG_BLOCK -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} + +static int __buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode, + bool check_refs) { struct buffer_head *bh, *head; int rc; + int expected_count; if (!page_has_buffers(page)) return migrate_page(mapping, newpage, page, mode); + /* Check whether page does not have extra refs before we do more work */ + expected_count = expected_page_refs(mapping, page); + if (page_count(page) != expected_count) + return -EAGAIN; + head = page_buffers(page); + if (!buffer_migrate_lock_buffers(head, mode)) + return -EAGAIN; - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); + if (check_refs) { + bool busy; + bool invalidated = false; - if (rc != MIGRATEPAGE_SUCCESS) - return rc; +recheck_buffers: + busy = false; + spin_lock(&mapping->private_lock); + bh = head; + do { + if (atomic_read(&bh->b_count)) { + busy = true; + break; + } + bh = bh->b_this_page; + } while (bh != head); + spin_unlock(&mapping->private_lock); + if (busy) { + if (invalidated) { + rc = -EAGAIN; + goto unlock_buffers; + } + invalidate_bh_lrus(); + invalidated = true; + goto recheck_buffers; + } + } - /* - * In the async case, migrate_page_move_mapping locked the buffers - * with an IRQ-safe spinlock held. In the sync case, the buffers - * need to be locked now - */ - if (mode != MIGRATE_ASYNC) - BUG_ON(!buffer_migrate_lock_buffers(head, mode)); + rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + if (rc != MIGRATEPAGE_SUCCESS) + goto unlock_buffers; ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -821,17 +809,41 @@ int buffer_migrate_page(struct address_space *mapping, else migrate_page_states(newpage, page); + rc = MIGRATEPAGE_SUCCESS; +unlock_buffers: bh = head; do { unlock_buffer(bh); - put_bh(bh); bh = bh->b_this_page; } while (bh != head); - return MIGRATEPAGE_SUCCESS; + return rc; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. For example attached buffer heads are accessed only under page lock. + */ +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return __buffer_migrate_page(mapping, newpage, page, mode, false); } EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Same as above except that this variant is more careful and checks that there + * are also no buffer head references. This function is the right one for + * mappings where buffer heads are directly looked up and referenced (such as + * block device mappings). + */ +int buffer_migrate_page_norefs(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return __buffer_migrate_page(mapping, newpage, page, mode, true); +} #endif /* @@ -899,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping, */ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) - return -EAGAIN; + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_page(mapping, newpage, page, mode); } @@ -1118,10 +1130,13 @@ out: * If migration is successful, decrease refcount of the newpage * which will not free the page because new page owner increased * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. + * list in here. Use the old state of the isolated source page to + * determine if we migrated a LRU page. newpage was already unlocked + * and possibly modified by its owner - don't rely on the page + * state. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(__PageMovable(newpage))) + if (unlikely(!is_lru)) put_page(newpage); else putback_lru_page(newpage); @@ -1272,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct anon_vma *anon_vma = NULL; /* - * Movability of hugepages depends on architectures and hugepage size. + * Migratability of hugepages depends on architectures and their size. * This check is necessary because some callers of hugepage migration * like soft offline and memory hotremove don't walk through page * tables or check whether the hugepage is pmd-based or not before @@ -1300,6 +1315,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, lock_page(hpage); } + /* + * Check for pages which are in the process of being freed. Without + * page_mapping() set, hugetlbfs specific move page routine will not + * be called and we could leak usage counts for subpools. + */ + if (page_private(hpage) && !page_mapping(hpage)) { + rc = -EBUSY; + goto out_unlock; + } + if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); @@ -1330,6 +1355,7 @@ put_anon: put_new_page = NULL; } +out_unlock: unlock_page(hpage); out: if (rc != -EAGAIN) @@ -1973,8 +1999,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int isolated = 0; struct page *new_page = NULL; int page_lru = page_is_file_cache(page); - unsigned long mmun_start = address & HPAGE_PMD_MASK; - unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), @@ -1997,15 +2022,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; new_page->index = page->index; + /* flush the cache before copying using the kernel virtual address */ + flush_cache_range(vma, start, start + HPAGE_PMD_SIZE); migrate_page_copy(new_page, page); WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -2029,16 +2054,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); /* - * Clear the old entry under pagetable lock and establish the new PTE. - * Any parallel GUP will either observe the old page blocking on the - * page lock, block on the page table lock or observe the new page. - * The SetPageUptodate on the new page and page_add_new_anon_rmap - * guarantee the copy is visible before the pagetable update. + * Overwrite the old entry under pagetable lock and establish + * the new PTE. Any parallel GUP will either observe the old + * page blocking on the page lock, block on the page table + * lock or observe the new page. The SetPageUptodate on the + * new page and page_add_new_anon_rmap guarantee the copy is + * visible before the pagetable update. + */ + page_add_anon_rmap(new_page, vma, start, true); + /* + * At this point the pmd is numa/protnone (i.e. non present) and the TLB + * has already been flushed globally. So no TLB can be currently + * caching this non present pmd mapping. There's no need to clear the + * pmd before doing set_pmd_at(), nor to flush the TLB after + * set_pmd_at(). Clearing the pmd here would introduce a race + * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the + * mmap_sem for reading. If the pmd is set to NULL at any given time, + * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this + * pmd. */ - flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start, true); - pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); page_ref_unfreeze(page, 2); @@ -2047,11 +2082,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2075,7 +2105,7 @@ out_fail: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); - set_pmd_at(mm, mmun_start, pmd, entry); + set_pmd_at(mm, start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } spin_unlock(ptl); @@ -2309,6 +2339,7 @@ next: */ static void migrate_vma_collect(struct migrate_vma *migrate) { + struct mmu_notifier_range range; struct mm_walk mm_walk; mm_walk.pmd_entry = migrate_vma_collect_pmd; @@ -2320,13 +2351,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_invalidate_range_start(mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + migrate->end); + mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } @@ -2605,7 +2634,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(mm, pmdp, addr)) + if (pte_alloc(mm, pmdp)) goto abort; /* See the comment in pte_alloc_one_map() */ @@ -2707,9 +2736,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; - struct vm_area_struct *vma = migrate->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr, i, mmu_start; + struct mmu_notifier_range range; + unsigned long addr, i; bool notified = false; for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { @@ -2728,11 +2756,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate) continue; } if (!notified) { - mmu_start = addr; notified = true; - mmu_notifier_invalidate_range_start(mm, - mmu_start, - migrate->end); + + mmu_notifier_range_init(&range, + migrate->vma->vm_mm, + addr, migrate->end); + mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, &migrate->src[i], @@ -2773,8 +2802,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) * did already call it. */ if (notified) - mmu_notifier_invalidate_range_only_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(&range); } /* |