diff options
Diffstat (limited to 'mm/gup.c')
-rw-r--r-- | mm/gup.c | 698 |
1 files changed, 444 insertions, 254 deletions
@@ -134,6 +134,7 @@ void put_user_pages(struct page **pages, unsigned long npages) } EXPORT_SYMBOL(put_user_pages); +#ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -515,7 +516,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ -struct page *follow_page_mask(struct vm_area_struct *vma, +static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { @@ -585,11 +586,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); - BUG_ON(pgd_none(*pgd)); + if (pgd_none(*pgd)) + return -EFAULT; p4d = p4d_offset(pgd, address); - BUG_ON(p4d_none(*p4d)); + if (p4d_none(*p4d)) + return -EFAULT; pud = pud_offset(p4d, address); - BUG_ON(pud_none(*pud)); + if (pud_none(*pud)) + return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; @@ -605,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); - - /* - * This should never happen (a device public page in the gate - * area). - */ - if (is_device_public_page(*page)) - goto unmap; } if (unlikely(!try_get_page(*page))) { ret = -ENOMEM; @@ -1042,10 +1039,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, BUG_ON(ret >= nr_pages); } - if (!pages) - /* If it's a prefault don't insist harder */ - return ret; - if (ret > 0) { nr_pages -= ret; pages_done += ret; @@ -1061,8 +1054,12 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, pages_done = ret; break; } - /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ - pages += ret; + /* + * VM_FAULT_RETRY triggered, so seek to the faulting offset. + * For the prefault case (!pages) we only update counts. + */ + if (likely(pages)) + pages += ret; start += ret << PAGE_SHIFT; /* @@ -1085,7 +1082,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, pages_done++; if (!nr_pages) break; - pages++; + if (likely(pages)) + pages++; start += PAGE_SIZE; } if (lock_dropped && *locked) { @@ -1100,86 +1098,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, } /* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * - * get_user_pages_locked() is suitable to replace the form: - * - * down_read(&mm->mmap_sem); - * do_something() - * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); - * - * to: - * - * int locked = 1; - * down_read(&mm->mmap_sem); - * do_something() - * get_user_pages_locked(tsk, mm, ..., pages, &locked); - * if (locked) - * up_read(&mm->mmap_sem); - */ -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages_locked); - -/* - * get_user_pages_unlocked() is suitable to replace the form: - * - * down_read(&mm->mmap_sem); - * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); - * - * with: - * - * get_user_pages_unlocked(tsk, mm, ..., pages); - * - * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead if specific gup_flags - * (e.g. FOLL_FORCE) are not required. - */ -long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) -{ - struct mm_struct *mm = current->mm; - int locked = 1; - long ret; - - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); - if (locked) - up_read(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(get_user_pages_unlocked); - -/* * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1255,6 +1173,198 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_remote); +/** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @nonblocking: + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking); +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ +#else /* CONFIG_MMU */ +static long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + struct vm_area_struct **vmas, int *locked, + unsigned int foll_flags) +{ + struct vm_area_struct *vma; + unsigned long vm_flags; + int i; + + /* calculate required read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. + */ + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + for (i = 0; i < nr_pages; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + + if (pages) { + pages[i] = virt_to_page(start); + if (pages[i]) + get_page(pages[i]); + } + if (vmas) + vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; + } + + return i; + +finish_or_fault: + return i ? : -EFAULT; +} +#endif /* !CONFIG_MMU */ + #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { @@ -1335,25 +1445,31 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, struct vm_area_struct **vmas, unsigned int gup_flags) { - long i; + unsigned long i; + unsigned long step; bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); check_again: - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_pages;) { + + struct page *head = compound_head(pages[i]); + + /* + * gup may start from a tail page. Advance step by the left + * part. + */ + step = (1 << compound_order(head)) - (pages[i] - head); /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out * of the CMA zone if possible. */ - if (is_migrate_cma_page(pages[i])) { - - struct page *head = compound_head(pages[i]); - - if (PageHuge(head)) { + if (is_migrate_cma_page(head)) { + if (PageHuge(head)) isolate_huge_page(head, &cma_page_list); - } else { + else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; @@ -1368,6 +1484,8 @@ check_again: } } } + + i += step; } if (!list_empty(&cma_page_list)) { @@ -1416,7 +1534,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, { return nr_pages; } -#endif +#endif /* CONFIG_CMA */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -1502,155 +1620,88 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/** - * populate_vma_page_range() - populate a range of pages in the vma. - * @vma: target vma - * @start: start address - * @end: end address - * @nonblocking: - * - * This takes care of mlocking the pages too if VM_LOCKED is set. +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). * - * return 0 on success, negative error code on error. + * get_user_pages_locked() is suitable to replace the form: * - * vma->vm_mm->mmap_sem must be held. + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); * - * If @nonblocking is NULL, it may be held for read or write and will - * be unperturbed. + * to: * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. + * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); */ -long populate_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) +long get_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) { - struct mm_struct *mm = vma->vm_mm; - unsigned long nr_pages = (end - start) / PAGE_SIZE; - int gup_flags; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; - if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) - gup_flags |= FOLL_WRITE; - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) - gup_flags |= FOLL_FORCE; + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; - /* - * We made sure addr is within a VMA, so the following will - * not result in a stack expansion that recurses back here. - */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); } +EXPORT_SYMBOL(get_user_pages_locked); /* - * __mm_populate - populate and/or mlock pages within a range of address space. + * get_user_pages_unlocked() is suitable to replace the form: * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) { struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; - struct vm_area_struct *vma = NULL; - int locked = 0; - long ret = 0; + int locked = 1; + long ret; - end = start + len; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; - for (nstart = start; nstart < end; nstart = nend) { - /* - * We want to fault in pages for [nstart; end) address range. - * Find first corresponding VMA. - */ - if (!locked) { - locked = 1; - down_read(&mm->mmap_sem); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - /* - * Set [nstart; nend) to intersection of desired address - * range with the first VMA. Also, skip undesirable VMA types. - */ - nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - /* - * Now fault in a range of pages. populate_vma_page_range() - * double checks the vma flags, so that it won't mlock pages - * if the vma was already munlocked. - */ - ret = populate_vma_page_range(vma, nstart, nend, &locked); - if (ret < 0) { - if (ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - break; - } - nend = nstart + ret * PAGE_SIZE; - ret = 0; - } + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); if (locked) up_read(&mm->mmap_sem); - return ret; /* 0 or negative error code */ -} - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; + return ret; } -#endif /* CONFIG_ELF_CORE */ +EXPORT_SYMBOL(get_user_pages_unlocked); /* - * Generic Fast GUP + * Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1682,20 +1733,64 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_GUP +#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking any + * locks. For this we would like to load the pointers atomically, but sometimes + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What + * we do have is the guarantee that a PTE will only either go from not present + * to present, or present to not present or both -- it will not switch to a + * completely different present page without a TLB flush in between; something + * that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. + * We load pte_high *after* loading pte_low, which ensures we don't see an older + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't + * picked up a changed pte high. We might have gotten rubbish values from + * pte_low and pte_high, but we are guaranteed that pte_low will not have the + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only + * operates on present ptes we're safe. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); -#ifndef gup_get_pte + return pte; +} +#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ /* - * We assume that the PTE can be read atomically. If this is not the case for - * your architecture, please provide the helper. + * We require that the PTE can be read atomically. */ static inline pte_t gup_get_pte(pte_t *ptep) { return READ_ONCE(*ptep); } -#endif +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, + struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; @@ -1800,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -1876,6 +1971,90 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, } #endif +#ifdef CONFIG_ARCH_HAS_HUGEPD +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = READ_ONCE(*ptep); + + if (!pte_access_permitted(pte, write)) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + head = try_get_compound_head(head, refs); + if (!head) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + SetPageReferenced(head); + return 1; +} + +static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned int pdshift, unsigned long end, int write, + struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} +#else +static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, int write, + struct page **pages, int *nr) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_HUGEPD */ + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { @@ -2116,19 +2295,21 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, return; } while (pgdp++, addr = next, addr != end); } +#else +static inline void gup_pgd_range(unsigned long addr, unsigned long end, + unsigned int flags, struct page **pages, int *nr) +{ +} +#endif /* CONFIG_HAVE_FAST_GUP */ #ifndef gup_fast_permitted /* * Check if it's allowed to use __get_user_pages_fast() for the range, or * we need to fall back to the slow version: */ -bool gup_fast_permitted(unsigned long start, int nr_pages) +static bool gup_fast_permitted(unsigned long start, unsigned long end) { - unsigned long len, end; - - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - return end >= start; + return true; } #endif @@ -2137,6 +2318,9 @@ bool gup_fast_permitted(unsigned long start, int nr_pages) * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -2145,10 +2329,12 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, unsigned long flags; int nr = 0; - start &= PAGE_MASK; + start = untagged_addr(start) & PAGE_MASK; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; + if (end <= start) + return 0; if (unlikely(!access_ok((void __user *)start, len))) return 0; @@ -2164,7 +2350,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * block IPIs that come from THPs splitting. */ - if (gup_fast_permitted(start, nr_pages)) { + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && + gup_fast_permitted(start, end)) { local_irq_save(flags); gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); @@ -2172,6 +2359,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +EXPORT_SYMBOL_GPL(__get_user_pages_fast); static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2218,18 +2406,21 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned long addr, len, end; int nr = 0, ret = 0; - start &= PAGE_MASK; + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM))) + return -EINVAL; + + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; - if (nr_pages <= 0) + if (end <= start) return 0; - if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (gup_fast_permitted(start, nr_pages)) { + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && + gup_fast_permitted(start, end)) { local_irq_disable(); gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); @@ -2255,5 +2446,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -#endif /* CONFIG_HAVE_GENERIC_GUP */ +EXPORT_SYMBOL_GPL(get_user_pages_fast); |