diff options
Diffstat (limited to 'mm/mremap.c')
-rw-r--r-- | mm/mremap.c | 113 |
1 files changed, 102 insertions, 11 deletions
diff --git a/mm/mremap.c b/mm/mremap.c index a9617e72e6b7..e3edef6b7a12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -191,22 +191,66 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +#ifdef CONFIG_HAVE_MOVE_PMD +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pmd_t pmd; + + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK) + || old_end - old_addr < PMD_SIZE) + return false; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + old_ptl = pmd_lock(vma->vm_mm, old_pmd); + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pmd */ + pmd = *old_pmd; + pmd_clear(old_pmd); + + VM_BUG_ON(!pmd_none(*new_pmd)); + + /* Set the new pmd */ + set_pmd_at(mm, new_addr, new_pmd, pmd); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#endif + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks) { unsigned long extent, next, old_end; + struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmun_start = old_addr; - mmun_end = old_end; - mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -237,8 +281,26 @@ unsigned long move_page_tables(struct vm_area_struct *vma, split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; + } else if (extent == PMD_SIZE) { +#ifdef CONFIG_HAVE_MOVE_PMD + /* + * If the extent is PMD-sized, try to speed the move by + * moving at the PMD level if possible. + */ + bool moved; + + if (need_rmap_locks) + take_rmap_locks(vma); + moved = move_normal_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); + if (need_rmap_locks) + drop_rmap_locks(vma); + if (moved) + continue; +#endif } - if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) + + if (pte_alloc(new_vma->vm_mm, new_pmd)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) @@ -247,7 +309,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd, new_addr, need_rmap_locks); } - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return len + old_addr - old_end; /* how much done */ } @@ -454,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; + /* + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. + * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmaping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. + */ + if ((mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; @@ -521,6 +600,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); @@ -557,12 +637,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_munmap does all the needed commit accounting + * __do_munmap does all the needed commit accounting, and + * downgrades mmap_sem to read if so directed. */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); - if (ret && old_len != new_len) + int retval; + + retval = __do_munmap(mm, addr+new_len, old_len - new_len, + &uf_unmap, true); + if (retval < 0 && old_len != new_len) { + ret = retval; goto out; + /* Returning 1 indicates mmap_sem is downgraded to read. */ + } else if (retval == 1) + downgraded = true; ret = addr; goto out; } @@ -627,7 +715,10 @@ out: vm_unacct_memory(charged); locked = 0; } - up_write(¤t->mm->mmap_sem); + if (downgraded) + up_read(¤t->mm->mmap_sem); + else + up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); |