Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  522
1 file changed, 419 insertions, 103 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 95918f410c0f..dfc940d5221d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
+#include <linux/migrate.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -41,6 +42,7 @@
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
+#include "hugetlb_vmemmap.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -1318,8 +1320,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
@@ -1375,7 +1375,40 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->nr_huge_pages_node[nid]--;
}
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ int zeroed;
+ int nid = page_to_nid(page);
+
+ VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ INIT_LIST_HEAD(&page->lru);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+
+ if (adjust_surplus) {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[nid]++;
+ }
+
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_page_private(page, 0);
+ SetHPageVmemmapOptimized(page);
+
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the last reference.
+ */
+ zeroed = put_page_testzero(page);
+ VM_BUG_ON_PAGE(!zeroed, page);
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+}
+
+static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
struct page *subpage = page;
@@ -1383,6 +1416,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
+ if (alloc_huge_page_vmemmap(h, page)) {
+ spin_lock_irq(&hugetlb_lock);
+ /*
+ * If we cannot allocate vmemmap pages, just refuse to free the
+ * page, put it back on the hugetlb free list, and treat it as
+ * a surplus page.
+ */
+ add_hugetlb_page(h, page, true);
+ spin_unlock_irq(&hugetlb_lock);
+ return;
+ }
+
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1398,12 +1443,79 @@ static void update_and_free_page(struct hstate *h, struct page *page)
}
}
+/*
+ * Because update_and_free_page() can be called in any context, it cannot
+ * use GFP_KERNEL to allocate vmemmap pages. Instead, the actual freeing is
+ * deferred to a workqueue so that the vmemmap pages do not have to be
+ * allocated with GFP_ATOMIC.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one by one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ struct page *page;
+ struct hstate *h;
+
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ page->mapping = NULL;
+ /*
+ * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+ * is going to trigger because a previous call to
+ * remove_hugetlb_page() will set_compound_page_dtor(page,
+ * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ */
+ h = size_to_hstate(page_size(page));
+
+ __update_and_free_page(h, page);
+
+ cond_resched();
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+ if (free_vmemmap_pages_per_hpage(h))
+ flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+ bool atomic)
+{
+ if (!HPageVmemmapOptimized(page) || !atomic) {
+ __update_and_free_page(h, page);
+ return;
+ }
+
+ /*
+ * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+ *
+ * Only call schedule_work() if hpage_freelist was previously
+ * empty. Otherwise, schedule_work() has already been called but the
+ * workfn hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ schedule_work(&free_hpage_work);
+}
+
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page);
+ update_and_free_page(h, page, false);
cond_resched();
}
}
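
The deferral above leans on the kernel's lockless llist: producers push one node with llist_add(), whose return value says whether the list was previously empty (and therefore whether the worker must be kicked), and the worker detaches everything at once with llist_del_all(). Below is a minimal userspace sketch of that push/drain-all pattern using C11 atomics instead of <linux/llist.h>; all names in it are illustrative and not part of the patch.

/* push/drain-all sketch, C11 atomics standing in for <linux/llist.h> */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int payload;
};

static _Atomic(struct node *) freelist;

/* Returns true if the list was empty, i.e. the consumer must be kicked. */
static bool push(struct node *n)
{
	struct node *old = atomic_load(&freelist);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&freelist, &old, n));

	return old == NULL;
}

/* Detach the whole list in one shot; the caller then walks it privately. */
static struct node *drain_all(void)
{
	return atomic_exchange(&freelist, NULL);
}

int main(void)
{
	struct node a = { .payload = 1 }, b = { .payload = 2 };

	if (push(&a))
		puts("first push: this is where schedule_work() would run");
	push(&b);

	for (struct node *n = drain_all(); n; n = n->next)
		printf("worker frees payload %d\n", n->payload);

	return 0;
}
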
@@ -1470,12 +1582,12 @@ void free_huge_page(struct page *page)
if (HPageTemporary(page)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
remove_hugetlb_page(h, page, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
@@ -1493,8 +1605,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_huge_page(struct page *page)
+static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
+ free_huge_page_vmemmap(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -1504,15 +1617,15 @@ static void __prep_new_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
- __prep_new_huge_page(page);
+ __prep_new_huge_page(h, page);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
-static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
{
- int i;
+ int i, j;
int nr_pages = 1 << order;
struct page *p = page + 1;
@@ -1534,11 +1647,48 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* after get_user_pages().
*/
__ClearPageReserved(p);
+ /*
+ * Subtle and very unlikely
+ *
+ * Gigantic 'page allocators' such as memblock or cma will
+ * return a set of pages with each page ref counted. We need
+ * to turn this set of pages into a compound page with tail
+ * page ref counts set to zero. Code such as speculative page
+ * cache adding could take a ref on a 'to be' tail page.
+ * We need to respect any increased ref count, and only set
+ * the ref count to zero if count is currently 1. If count
+ * is not 1, we call synchronize_rcu() in the hope that an RCU
+ * grace period will cause the ref count to drop, and then retry.
+ * If the count is still inflated on retry, we return an error and
+ * must discard the pages.
+ */
+ if (!page_ref_freeze(p, 1)) {
+ pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+ synchronize_rcu();
+ if (!page_ref_freeze(p, 1))
+ goto out_error;
+ }
set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
+ return true;
+
+out_error:
+ /* undo tail page modifications made above */
+ p = page + 1;
+ for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
+ clear_compound_head(p);
+ set_page_refcounted(p);
+ }
+ /* need to clear PG_reserved on remaining tail pages */
+ for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ __ClearPageReserved(p);
+ set_compound_order(page, 0);
+ page[1].compound_nr = 0;
+ __ClearPageHead(page);
+ return false;
}
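
page_ref_freeze() above only succeeds when the ref count is exactly the expected value, which is how the loop detects a speculative reference held on a would-be tail page. A self-contained userspace sketch of that freeze-if-exact idea, with a C11 atomic counter and hypothetical names (the synchronize_rcu()-and-retry step is only indicated in a comment):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Freeze the count to zero only if nobody else holds a reference. */
static bool ref_freeze(atomic_int *refcount, int expected)
{
	return atomic_compare_exchange_strong(refcount, &expected, 0);
}

int main(void)
{
	atomic_int ref = 1;

	if (ref_freeze(&ref, 1))
		puts("count frozen to 0: safe to turn into a tail page");
	else
		/* kernel: synchronize_rcu(), retry once, else discard pages */
		puts("inflated ref count: wait for a grace period and retry");

	return 0;
}
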
/*
@@ -1588,15 +1738,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
@@ -1661,7 +1808,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
struct page *page;
+ bool retry = false;
+retry:
if (hstate_is_gigantic(h))
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
@@ -1670,8 +1819,21 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
if (!page)
return NULL;
- if (hstate_is_gigantic(h))
- prep_compound_gigantic_page(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ /*
+ * Rare failure to convert pages to compound page.
+ * Free pages and try again - ONCE!
+ */
+ free_gigantic_page(page, huge_page_order(h));
+ if (!retry) {
+ retry = true;
+ goto retry;
+ }
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ return NULL;
+ }
+ }
prep_new_huge_page(h, page, page_to_nid(page));
return page;
@@ -1740,10 +1902,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
- * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- * (allocated or reserved.)
- * 0: successfully dissolved free hugepages or the page is not a
- * hugepage (considered as already dissolved)
+ * -ENOMEM: failed to allocate vmemmap pages needed to free the hugepage
+ * when the system is under memory pressure and the feature of
+ * freeing unused vmemmap pages associated with each hugetlb page
+ * is enabled.
+ * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
@@ -1785,19 +1951,38 @@ retry:
goto retry;
}
+ remove_hugetlb_page(h, head, false);
+ h->max_huge_pages--;
+ spin_unlock_irq(&hugetlb_lock);
+
/*
- * Move PageHWPoison flag from head page to the raw error page,
- * which makes any subpages rather than the error page reusable.
- * Normally update_and_free_page will allocate required vmemmap
+ * before freeing the page. update_and_free_page will fail to
+ * free the page if it can not allocate required vmemmap. We
+ * need to adjust max_huge_pages if the page is not freed.
- * Attempt to allocate vmemmap here so that we can take
+ * appropriate action on failure.
*/
- if (PageHWPoison(head) && page != head) {
- SetPageHWPoison(page);
- ClearPageHWPoison(head);
+ rc = alloc_huge_page_vmemmap(h, head);
+ if (!rc) {
+ /*
+ * Move PageHWPoison flag from head page to the raw
+ * error page, which makes any subpages rather than
+ * the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
+ update_and_free_page(h, head, false);
+ } else {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_page(h, head, false);
+ h->max_huge_pages++;
+ spin_unlock_irq(&hugetlb_lock);
}
- remove_hugetlb_page(h, page, false);
- h->max_huge_pages--;
- spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, head);
- return 0;
+
+ return rc;
}
out:
spin_unlock_irq(&hugetlb_lock);
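
The dissolve path above allocates the vmemmap pages before the hugepage is actually freed, so that a failed allocation can put the page straight back with add_hugetlb_page() and be reported as -ENOMEM while nr_huge_pages and max_huge_pages stay consistent. A small userspace sketch of that ordering, with hypothetical pool/metadata names that are not part of the patch:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct pool {
	int nr_items;
	int max_items;
};

static int dissolve_item(struct pool *p)
{
	void *metadata;

	/* 1) take the item out of the pool (remove_hugetlb_page) */
	p->nr_items--;
	p->max_items--;

	/* 2) allocate what freeing will need (alloc_huge_page_vmemmap) */
	metadata = malloc(4096);
	if (!metadata) {
		/* 3a) failure: restore the item and the counters, report it */
		p->nr_items++;
		p->max_items++;
		return -ENOMEM;
	}

	/* 3b) success: the item can really be freed now */
	free(metadata);
	return 0;
}

int main(void)
{
	struct pool p = { .nr_items = 1, .max_items = 1 };

	printf("dissolve returned %d\n", dissolve_item(&p));
	return 0;
}
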
@@ -2121,12 +2306,18 @@ out:
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed. It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
+ VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
@@ -2170,11 +2361,21 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_del(resv, idx, idx + 1);
}
break;
+ case VMA_DEL_RESV:
+ if (vma->vm_flags & VM_MAYSHARE) {
+ region_abort(resv, idx, idx + 1, 1);
+ ret = region_del(resv, idx, idx + 1);
+ } else {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ }
+ break;
default:
BUG();
}
- if (vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
/*
* We know private mapping must have HPAGE_RESV_OWNER set.
@@ -2222,25 +2423,39 @@ static long vma_add_reservation(struct hstate *h,
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
+static long vma_del_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
/*
- * This routine is called to restore a reservation on error paths. In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page. When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set. However, free_huge_page can not adjust the
- * reserve map. Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for pages allocated via alloc_huge_page(), and
+ * the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the page consumed the reservation.
+ * HPageRestoreReserve is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreReserve is
+ * not set. However, alloc_huge_page always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count. But, free_huge_page does not have enough context
+ * to adjust the reservation map. This case deals primarily with private
+ * mappings. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page. Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
*/
-static void restore_reserve_on_error(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address,
- struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct page *page)
{
- if (unlikely(HPageRestoreReserve(page))) {
- long rc = vma_needs_reservation(h, vma, address);
+ long rc = vma_needs_reservation(h, vma, address);
- if (unlikely(rc < 0)) {
+ if (HPageRestoreReserve(page)) {
+ if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
* manipulation. Clear HPageRestoreReserve so that
@@ -2253,16 +2468,57 @@ static void restore_reserve_on_error(struct hstate *h,
* accounting of reserve counts.
*/
ClearHPageRestoreReserve(page);
- } else if (rc) {
- rc = vma_add_reservation(h, vma, address);
- if (unlikely(rc < 0))
+ else if (rc)
+ (void)vma_add_reservation(h, vma, address);
+ else
+ vma_end_reservation(h, vma, address);
+ } else {
+ if (!rc) {
+ /*
+ * This indicates there is an entry in the reserve map
+ * added by alloc_huge_page. We know it was added
+ * before the alloc_huge_page call, otherwise
+ * HPageRestoreReserve would be set on the page.
+ * Remove the entry so that a subsequent allocation
+ * does not consume a reservation.
+ */
+ rc = vma_del_reservation(h, vma, address);
+ if (rc < 0)
/*
- * See above comment about rare out of
- * memory condition.
+ * VERY rare out of memory condition. Since
+ * we can not delete the entry, set
+ * HPageRestoreReserve so that the reserve
+ * count will be incremented when the page
+ * is freed. This reserve will be consumed
+ * on a subsequent allocation.
*/
- ClearHPageRestoreReserve(page);
+ SetHPageRestoreReserve(page);
+ } else if (rc < 0) {
+ /*
+ * Rare out of memory condition from
+ * vma_needs_reservation call. Memory allocation is
+ * only attempted if a new entry is needed. Therefore,
+ * this implies there is not an entry in the
+ * reserve map.
+ *
+ * For shared mappings, no entry in the map indicates
+ * no reservation. We are done.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * For private mappings, no entry indicates
+ * a reservation is present. Since we can
+ * not add an entry, set HPageRestoreReserve
+ * on the page so reserve count will be
+ * incremented when freed. This reserve will
+ * be consumed on a subsequent allocation.
+ */
+ SetHPageRestoreReserve(page);
} else
- vma_end_reservation(h, vma, address);
+ /*
+ * No reservation present, do nothing
+ */
+ vma_end_reservation(h, vma, address);
}
}
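
Because the nesting above is easy to misread, here is a compact userspace model of the decision tree restore_reserve_on_error() implements, keyed on HPageRestoreReserve, the sign of vma_needs_reservation()'s return value, and VM_MAYSHARE. The nested fallback where vma_del_reservation() itself fails is omitted, and all names are illustrative only:

#include <stdbool.h>
#include <stdio.h>

enum action {
	CLEAR_RESTORE,	/* ClearHPageRestoreReserve(page)     */
	ADD_ENTRY,	/* vma_add_reservation()              */
	DEL_ENTRY,	/* vma_del_reservation()              */
	SET_RESTORE,	/* SetHPageRestoreReserve(page)       */
	END_RESV,	/* vma_end_reservation()              */
	NOTHING,	/* shared mapping, no entry, all done */
};

/* rc is vma_needs_reservation(): < 0 on failure, 0 if an entry exists,
 * > 0 if an entry would have to be added. */
static enum action decide(bool restore_reserve, long rc, bool vm_shared)
{
	if (restore_reserve) {
		if (rc < 0)
			return CLEAR_RESTORE;
		return rc ? ADD_ENTRY : END_RESV;
	}
	if (rc == 0)
		return DEL_ENTRY;	/* undo the entry alloc_huge_page made */
	if (rc < 0)
		return vm_shared ? NOTHING : SET_RESTORE;
	return END_RESV;		/* no entry, nothing to undo */
}

int main(void)
{
	/* private mapping, reservation consumed, reserve map update needed */
	printf("action = %d\n", decide(true, 1, false));
	return 0;
}
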
@@ -2283,14 +2539,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
/*
* Before dissolving the page, we need to allocate a new one for the
- * pool to remain stable. Using alloc_buddy_huge_page() allows us to
- * not having to deal with prep_new_huge_page() and avoids dealing of any
- * counters. This simplifies and let us do the whole thing under the
- * lock.
+ * pool to remain stable. Here, we allocate the page and 'prep' it
+ * by doing everything but actually updating counters and adding to
+ * the pool. This simplifies and lets us do most of the processing
+ * under the lock.
*/
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
+ __prep_new_huge_page(h, new_page);
retry:
spin_lock_irq(&hugetlb_lock);
@@ -2329,14 +2586,9 @@ retry:
remove_hugetlb_page(h, old_page, false);
/*
- * new_page needs to be initialized with the standard hugetlb
- * state. This is normally done by prep_new_huge_page() but
- * that takes hugetlb_lock which is already held so we need to
- * open code it here.
* Reference count trick is needed because allocator gives us
* referenced page but the pool requires pages with 0 refcount.
*/
- __prep_new_huge_page(new_page);
__prep_account_new_huge_page(h, nid);
page_ref_dec(new_page);
enqueue_huge_page(h, new_page);
@@ -2345,14 +2597,14 @@ retry:
* Pages have been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page);
+ update_and_free_page(h, old_page, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
- __free_pages(new_page, huge_page_order(h));
+ update_and_free_page(h, new_page, false);
return ret;
}
@@ -2557,16 +2809,10 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page,
- unsigned int order)
-{
- if (unlikely(order > (MAX_ORDER - 1)))
- prep_compound_gigantic_page(page, order);
- else
- prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order >= MAX_ORDER) pages.
+ */
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
@@ -2575,20 +2821,23 @@ static void __init gather_bootmem_prealloc(void)
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, huge_page_order(h));
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* free it into the hugepage allocator */
+ if (prep_compound_gigantic_page(page, huge_page_order(h))) {
+ WARN_ON(PageReserved(page));
+ prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* add to the hugepage allocator */
+ } else {
+ free_gigantic_page(page, huge_page_order(h));
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ }
/*
- * If we had gigantic hugepages allocated at boot time, we need
- * to restore the 'stolen' pages to totalram_pages in order to
- * fix confusing memory reports from free(1) and another
- * side-effects, like CommitLimit going negative.
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
@@ -2766,6 +3015,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* pages in hstate via the proc/sysfs interfaces.
*/
mutex_lock(&h->resize_lock);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
/*
@@ -2875,6 +3125,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
while (count < persistent_huge_pages(h)) {
@@ -3382,6 +3633,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ hugetlb_vmemmap_init(h);
parsed_hstate = h;
}
@@ -3856,6 +4108,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -3866,7 +4119,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
- entry = arch_make_huge_pte(entry, vma, page, writable);
+ entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
}
@@ -3989,12 +4242,13 @@ again:
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
- if (is_write_migration_entry(swp_entry) && cow) {
+ if (is_writable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&swp_entry);
+ swp_entry = make_readable_migration_entry(
+ swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
@@ -4037,6 +4291,8 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
+ restore_reserve_on_error(h, vma, addr,
+ new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
@@ -4869,30 +5125,37 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
- struct address_space *mapping;
- pgoff_t idx;
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
+ int ret = -ENOMEM;
struct page *page;
int writable;
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
if (!page)
goto out;
} else if (!*pagep) {
- ret = -ENOMEM;
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
@@ -4901,12 +5164,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
+ /* Free the allocated page which may have
+ * consumed a reservation.
+ */
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ put_page(page);
+
+ /* Allocate a temporary page to hold the copied
+ * contents.
+ */
+ page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
*pagep = page;
- /* don't free the page */
+ /* Set the outparam pagep and return to the caller to
+ * copy the contents outside the lock. Don't free the
+ * page.
+ */
goto out;
}
} else {
- page = *pagep;
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ put_page(*pagep);
+ ret = -EEXIST;
+ *pagep = NULL;
+ goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ *pagep = NULL;
+ goto out;
+ }
+ copy_huge_page(page, *pagep);
+ put_page(*pagep);
*pagep = NULL;
}
@@ -4996,6 +5291,7 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
@@ -5144,8 +5440,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
continue;
}
- refs = min3(pages_per_huge_page(h) - pfn_offset,
- (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+ /* vaddr may not be aligned to PAGE_SIZE */
+ refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+ (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
record_subpages_vmas(mem_map_offset(page, pfn_offset),
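
A worked example of the refs calculation above, as a userspace sketch with made-up numbers: for a 2MB huge page and a vaddr that is not page aligned, the old third term under-counts by one page, while rounding vaddr down to the page boundary does not. PAGE_SHIFT, ALIGN_DOWN and MIN3 are defined locally here and only stand in for the kernel versions.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define MIN3(a, b, c)	((a) < (b) ? ((a) < (c) ? (a) : (c)) : \
				     ((b) < (c) ? (b) : (c)))

int main(void)
{
	unsigned long pages_per_huge_page = 512;	/* 2MB / 4KB */
	unsigned long pfn_offset = 0;
	unsigned long remainder = 1024;
	unsigned long vaddr = 0x200100;			/* not page aligned */
	unsigned long vm_end = 0x400000;

	unsigned long before = MIN3(pages_per_huge_page - pfn_offset, remainder,
				    (vm_end - vaddr) >> PAGE_SHIFT);
	unsigned long after = MIN3(pages_per_huge_page - pfn_offset, remainder,
				   (vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);

	printf("refs without ALIGN_DOWN: %lu, with ALIGN_DOWN: %lu\n",
	       before, after);	/* prints 511 vs 512 */
	return 0;
}
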
@@ -5237,10 +5534,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
pte_t newpte;
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
@@ -5251,10 +5549,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (!huge_pte_none(pte)) {
pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
- pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
@@ -5847,6 +6146,23 @@ unlock:
return ret;
}
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+ int ret = 0;
+
+ *hugetlb = false;
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHeadHuge(page)) {
+ *hugetlb = true;
+ if (HPageFreed(page) || HPageMigratable(page))
+ ret = get_page_unless_zero(page);
+ else
+ ret = -EBUSY;
+ }
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
void putback_active_hugepage(struct page *page)
{
spin_lock_irq(&hugetlb_lock);