From d08b3851da41d0ee60851f2c75b118e1f7a5fc89 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Sep 2006 23:30:57 -0700 Subject: [PATCH] mm: tracking shared dirty pages Tracking of dirty pages in shared writeable mmap()s. The idea is simple: write protect clean shared writeable pages, catch the write-fault, make writeable and set dirty. On page write-back clean all the PTE dirty bits and write protect them once again. The implementation is a tad harder, mainly because the default backing_dev_info capabilities were too loosely maintained. Hence it is not enough to test the backing_dev_info for cap_account_dirty. The current heuristic is as follows, a VMA is eligible when: - its shared writeable (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED) - it is not a 'special' mapping (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0 - the backing_dev_info is cap_account_dirty mapping_cap_account_dirty(vma->vm_file->f_mapping) - f_op->mmap() didn't change the default page protection Page from remap_pfn_range() are explicitly excluded because their COW semantics are already horrid enough (see vm_normal_page() in do_wp_page()) and because they don't have a backing store anyway. mprotect() is taught about the new behaviour as well. However it overrides the last condition. Cleaning the pages on write-back is done with page_mkclean() a new rmap call. It can be called on any page, but is currently only implemented for mapped pages, if the page is found the be of a VMA that accounts dirty pages it will also wrprotect the PTE. Finally, in fs/buffers.c:try_to_free_buffers(); remove clear_page_dirty() from under ->private_lock. This seems to be safe, since ->private_lock is used to serialize access to the buffers, not the page itself. This is needed because clear_page_dirty() will call into page_mkclean() and would thereby violate locking order. [dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU] Signed-off-by: Peter Zijlstra Cc: Hugh Dickins Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 109e9866237e..fa941b169071 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1458,14 +1458,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse, ret = VM_FAULT_MINOR; + int reuse = 0, ret = VM_FAULT_MINOR; + struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) goto gotten; - if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == - (VM_SHARED|VM_WRITE))) { + /* + * Only catch write-faults on shared writable pages, read-only + * shared pages can get COWed by get_user_pages(.write=1, .force=1). + */ + if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { if (vma->vm_ops && vma->vm_ops->page_mkwrite) { /* * Notify the address space that the page is about to @@ -1494,13 +1499,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_same(*page_table, orig_pte)) goto unlock; } - + dirty_page = old_page; + get_page(dirty_page); reuse = 1; } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { reuse = can_share_swap_page(old_page); unlock_page(old_page); - } else { - reuse = 0; } if (reuse) { @@ -1566,6 +1570,10 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + set_page_dirty(dirty_page); + put_page(dirty_page); + } return ret; oom: if (old_page) @@ -2098,6 +2106,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; + struct page *dirty_page = NULL; pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); @@ -2192,6 +2201,10 @@ retry: } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); + if (write_access) { + dirty_page = new_page; + get_page(dirty_page); + } } } else { /* One of our sibling threads was faster, back out. */ @@ -2204,6 +2217,10 @@ retry: lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + set_page_dirty(dirty_page); + put_page(dirty_page); + } return ret; oom: page_cache_release(new_page); -- cgit From edc79b2a46ed854595e40edcf3f8b37f9f14aa3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Sep 2006 23:30:58 -0700 Subject: [PATCH] mm: balance dirty pages Now that we can detect writers of shared mappings, throttle them. Avoids OOM by surprise. Signed-off-by: Peter Zijlstra Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 1 + mm/memory.c | 5 +++-- mm/page-writeback.c | 10 ++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0422036af4eb..56a23a0e7f2e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -116,6 +116,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, loff_t count); int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, loff_t pos, loff_t count); +void set_page_dirty_balance(struct page *page); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl diff --git a/mm/memory.c b/mm/memory.c index fa941b169071..dd7d7fc5ed60 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1571,7 +1572,7 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - set_page_dirty(dirty_page); + set_page_dirty_balance(dirty_page); put_page(dirty_page); } return ret; @@ -2218,7 +2219,7 @@ retry: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - set_page_dirty(dirty_page); + set_page_dirty_balance(dirty_page); put_page(dirty_page); } return ret; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1c87430b7a25..b9f4c6f1be86 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -244,6 +244,16 @@ static void balance_dirty_pages(struct address_space *mapping) pdflush_operation(background_writeout, 0); } +void set_page_dirty_balance(struct page *page) +{ + if (set_page_dirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping) + balance_dirty_pages_ratelimited(mapping); + } +} + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied -- cgit From ee6a6457886a80415db209e87033b63f2b06558c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Sep 2006 23:31:00 -0700 Subject: [PATCH] mm: fixup do_wp_page() Wrt. the recent modifications in do_wp_page() Hugh Dickins pointed out: "I now realize it's right to the first order (normal case) and to the second order (ptrace poke), but not to the third order (ptrace poke anon page here to be COWed - perhaps can't occur without intervening mprotects)." This patch restores the old COW behaviour for anonymous pages. Signed-off-by: Peter Zijlstra Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index dd7d7fc5ed60..65962534b4ed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1467,11 +1467,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto gotten; /* - * Only catch write-faults on shared writable pages, read-only - * shared pages can get COWed by get_user_pages(.write=1, .force=1). + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. */ - if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + if (PageAnon(old_page)) { + if (!TestSetPageLocked(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); + } + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { /* * Notify the address space that the page is about to @@ -1503,9 +1513,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, dirty_page = old_page; get_page(dirty_page); reuse = 1; - } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { - reuse = can_share_swap_page(old_page); - unlock_page(old_page); } if (reuse) { -- cgit From bfa5bf6d6446f0028187a727f792fbc7934228ad Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 25 Sep 2006 23:31:22 -0700 Subject: [PATCH] Add kerneldocs for some functions in mm/memory.c These functions are already documented quite well with long comments. Now add kerneldoc style header to make this turn up in everyones favorite doc format. Signed-off-by: Rolf Eike Beer Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 65962534b4ed..92a3ebd8d795 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1227,7 +1227,12 @@ out: return retval; } -/* +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * * This allows drivers to insert individual pages they've allocated * into a user vma. * @@ -1319,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/* Note: this is only safe if the mm semaphore is held when called. */ +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -1801,9 +1815,10 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating * * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last @@ -1872,11 +1887,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) } EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ -/* +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @addr: address to start + * @vma: user vma this addresses belong to + * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... + * the 'original' request together with the readahead ones... * * This has been extended to use the NUMA policies from the mm triggering * the readahead. -- cgit From f4b81804a2d1ab341a4613089dc31ecce0800ed8 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Wed, 27 Sep 2006 01:50:10 -0700 Subject: [PATCH] do_no_pfn() Implement do_no_pfn() for handling mapping of memory without a struct page backing it. This avoids creating fake page table entries for regions which are not backed by real memory. This feature is used by the MSPEC driver and other users, where it is highly undesirable to have a struct page sitting behind the page (for instance if the page is accessed in cached mode via the struct page in parallel to the the driver accessing it uncached, which can result in data corruption on some architectures, such as ia64). This version uses specific NOPFN_{SIGBUS,OOM} return values, rather than expect all negative pfn values would be an error. It also bugs on cow mappings as this would not work with the VM. [akpm@osdl.org: micro-optimise] Signed-off-by: Jes Sorensen Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 ++++++ mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 5 deletions(-) (limited to 'mm/memory.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e433bbc6e7e..22165cb18906 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -199,6 +199,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); + unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address); int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); /* notification that a previously read-only page is about to become @@ -593,6 +594,12 @@ static inline int page_mapped(struct page *page) #define NOPAGE_SIGBUS (NULL) #define NOPAGE_OOM ((struct page *) (-1)) +/* + * Error return values for the *_nopfn functions + */ +#define NOPFN_SIGBUS ((unsigned long) -1) +#define NOPFN_OOM ((unsigned long) -2) + /* * Different kinds of faults, as returned by handle_mm_fault(). * Used to decide whether a process gets delivered SIGBUS or diff --git a/mm/memory.c b/mm/memory.c index 92a3ebd8d795..f2ef1dcfff77 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2255,6 +2255,54 @@ oom: return VM_FAULT_OOM; } +/* + * do_no_pfn() tries to create a new page mapping for a page without + * a struct_page backing it + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * It is expected that the ->nopfn handler always returns the same pfn + * for a given virtual mapping. + * + * Mark this `noinline' to prevent it from bloating the main pagefault code. + */ +static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + pte_t entry; + unsigned long pfn; + int ret = VM_FAULT_MINOR; + + pte_unmap(page_table); + BUG_ON(!(vma->vm_flags & VM_PFNMAP)); + BUG_ON(is_cow_mapping(vma->vm_flags)); + + pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + if (pfn == NOPFN_OOM) + return VM_FAULT_OOM; + if (pfn == NOPFN_SIGBUS) + return VM_FAULT_SIGBUS; + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + entry = pfn_pte(pfn, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + } + pte_unmap_unlock(page_table, ptl); + return ret; +} + /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable @@ -2317,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm, old_entry = entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); + if (vma->vm_ops) { + if (vma->vm_ops->nopage) + return do_no_page(mm, vma, address, + pte, pmd, + write_access); + if (unlikely(vma->vm_ops->nopfn)) + return do_no_pfn(mm, vma, address, pte, + pmd, write_access); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); } if (pte_file(entry)) return do_file_page(mm, vma, address, -- cgit From 0ec76a110f432e98277e464b82ace8dd66571689 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Sep 2006 01:50:15 -0700 Subject: [PATCH] NOMMU: Check that access_process_vm() has a valid target Check that access_process_vm() is accessing a valid mapping in the target process. This limits ptrace() accesses and accesses through /proc//maps to only those regions actually mapped by a program. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 54 ------------------------------------------------------ mm/memory.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/nommu.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 54 deletions(-) (limited to 'mm/memory.c') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9a111f70145c..8aad0331d82e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } -/* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages - */ - -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) -{ - struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; - - mm = get_task_mm(tsk); - if (!mm) - return 0; - - down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(tsk, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); - } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); - } - kunmap(page); - page_cache_release(page); - len -= bytes; - buf += bytes; - addr += bytes; - } - up_read(&mm->mmap_sem); - mmput(mm); - - return buf - old_buf; -} - int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; diff --git a/mm/memory.c b/mm/memory.c index f2ef1dcfff77..601159a46ab6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2604,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, ret, offset; + void *maddr; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} diff --git a/mm/nommu.c b/mm/nommu.c index d99dea31e443..d08acdae0036 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1206,3 +1206,50 @@ struct page *filemap_nopage(struct vm_area_struct *area, BUG(); return NULL; } + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct vm_list_struct *vml; + struct vm_area_struct *vma; + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + + /* the access must start within one of the target process's mappings */ + for (vml = mm->context.vmlist; vml; vml = vml->next) + if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) + break; + + if (vml) { + vma = vml->vma; + + /* don't overrun this mapping */ + if (addr + len >= vma->vm_end) + len = vma->vm_end - addr; + + /* only read or write mappings where it is permitted */ + if (write && vma->vm_flags & VM_WRITE) + len -= copy_to_user((void *) addr, buf, len); + else if (!write && vma->vm_flags & VM_READ) + len -= copy_from_user(buf, (void *) addr, len); + else + len = 0; + } else { + len = 0; + } + + up_read(&mm->mmap_sem); + mmput(mm); + return len; +} -- cgit