Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--	include/linux/mm.h	313
1 files changed, 266 insertions, 47 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c54fb96cb1e6..5a323422d783 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@
 #include <linux/memremap.h>
 #include <linux/overflow.h>
 #include <linux/sizes.h>
+#include <linux/sched.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -342,6 +343,20 @@ extern unsigned int kobjsize(const void *objp);
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
 
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/* Common data flag combinations */
+#define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC	(VM_READ | VM_WRITE | VM_MAYREAD | \
+				 VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC	(VM_READ | VM_WRITE | VM_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS		/* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
+#endif
+
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
 #endif
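The VM_DATA_FLAGS_* combinations added above let architectures pick their default data-mapping flags from one shared set instead of open-coding the same flag soup in every asm/page.h. As a rough illustration (not taken from this diff; the path is a placeholder), an architecture that honours the READ_IMPLIES_EXEC personality for data mappings would select the TSK_EXEC variant, while one whose data mappings are never executable would pick the NON_EXEC one:

/*
 * Hypothetical arch/<arch>/include/asm/page.h excerpt, illustration only.
 * Data mappings are readable and writable; they become executable only when
 * the READ_IMPLIES_EXEC personality bit is set (folded in via TASK_EXEC).
 */
#define VM_DATA_DEFAULT_FLAGS	VM_DATA_FLAGS_TSK_EXEC

/*
 * An arch whose data mappings are never executable would instead use:
 * #define VM_DATA_DEFAULT_FLAGS	VM_DATA_FLAGS_NON_EXEC
 */

Architectures that define nothing keep the generic VM_DATA_FLAGS_EXEC fallback from the #ifndef block above.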
@@ -354,12 +369,18 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
 
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+
 /*
  * Special vmas that are non-mergable, non-mlock()able.
- * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
  */
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
+/* This mask prevents VMA from being scanned with khugepaged */
+#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
+
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
 
@@ -378,15 +399,75 @@ extern unsigned int kobjsize(const void *objp);
  */
 extern pgprot_t protection_map[16];
 
-#define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
-#define FAULT_FLAG_MKWRITE	0x02	/* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY	0x04	/* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED	0x20	/* Second try */
-#define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
-#define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
-#define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
+/**
+ * Fault flag definitions.
+ *
+ * @FAULT_FLAG_WRITE: Fault was a write fault.
+ * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
+ * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_sem and wait when retrying.
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
+ * @FAULT_FLAG_TRIED: The fault has been tried once.
+ * @FAULT_FLAG_USER: The fault originated in userspace.
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ *
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+ * whether we would allow page faults to retry by specifying these two
+ * fault flags correctly.  Currently there can be three legal combinations:
+ *
+ * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
+ *                              this is the first try
+ *
+ * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
+ *                              we've already tried at least once
+ *
+ * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
+ *
+ * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
+ * be used.  Note that page faults can be allowed to retry for multiple times,
+ * in which case we'll have an initial fault with flags (a) then later on
+ * continuous faults with flags (b).  We should always try to detect pending
+ * signals before a retry to make sure the continuous page faults can still be
+ * interrupted if necessary.
+ */
+#define FAULT_FLAG_WRITE			0x01
+#define FAULT_FLAG_MKWRITE			0x02
+#define FAULT_FLAG_ALLOW_RETRY			0x04
+#define FAULT_FLAG_RETRY_NOWAIT			0x08
+#define FAULT_FLAG_KILLABLE			0x10
+#define FAULT_FLAG_TRIED			0x20
+#define FAULT_FLAG_USER				0x40
+#define FAULT_FLAG_REMOTE			0x80
+#define FAULT_FLAG_INSTRUCTION			0x100
+#define FAULT_FLAG_INTERRUPTIBLE		0x200
+
+/*
+ * The default fault flags that should be used by most of the
+ * arch-specific page fault handlers.
+ */
+#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
+			     FAULT_FLAG_KILLABLE | \
+			     FAULT_FLAG_INTERRUPTIBLE)
+
+/**
+ * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
+ *
+ * This is mostly used for places where we want to try to avoid taking
+ * the mmap_sem for too long a time when waiting for another condition
+ * to change, in which case we can try to be polite to release the
+ * mmap_sem in the first round to avoid potential starvation of other
+ * processes that would also want the mmap_sem.
+ *
+ * Return: true if the page fault allows retry and this is the first
+ * attempt of the fault handling; false otherwise.
+ */
+static inline bool fault_flag_allow_retry_first(unsigned int flags)
+{
+	return (flags & FAULT_FLAG_ALLOW_RETRY) &&
+	    (!(flags & FAULT_FLAG_TRIED));
+}
 
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
@@ -397,7 +478,8 @@ extern pgprot_t protection_map[16];
 	{ FAULT_FLAG_TRIED,		"TRIED" }, \
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
-	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }
+	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
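The block above turns the fault flags into documented values, adds FAULT_FLAG_INTERRUPTIBLE, and spells out the ALLOW_RETRY/TRIED retry protocol. A compressed sketch of how an arch fault handler of this era is expected to drive that protocol follows; the function and its arguments are invented for illustration, and real handlers also classify the access, handle kernel-mode faults, OOM and SIGBUS, and use fault_signal_pending() rather than the bare fatal-signal check shown here:

#include <linux/mm.h>
#include <linux/sched/signal.h>

/*
 * Illustration only: a minimal user-address fault path using the new
 * FAULT_FLAG_DEFAULT and the (a)/(b) retry combinations described above.
 * Error reporting (SIGSEGV, SIGBUS, OOM) is omitted.
 */
static void sketch_handle_user_fault(unsigned long address, bool write)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_DEFAULT;	/* ALLOW_RETRY | KILLABLE | INTERRUPTIBLE */
	vm_fault_t fault;

	flags |= FAULT_FLAG_USER;
	if (write)
		flags |= FAULT_FLAG_WRITE;

retry:
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return;			/* would deliver SIGSEGV */
	}

	/* Combination (a) on the first pass, combination (b) once TRIED is set. */
	fault = handle_mm_fault(vma, address, flags);

	if (fault & VM_FAULT_RETRY) {
		/*
		 * The core has already dropped mmap_sem for us.  Detect pending
		 * signals before retrying, as the comment above asks; real
		 * handlers use fault_signal_pending(fault, regs) here so that
		 * FAULT_FLAG_INTERRUPTIBLE also covers non-fatal signals.
		 */
		if (fatal_signal_pending(current))
			return;
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	up_read(&mm->mmap_sem);
}

fault_flag_allow_retry_first() serves the other side of the protocol: core code that may have to sleep can use it to decide whether it is still polite to drop mmap_sem on the first attempt and report VM_FAULT_RETRY instead of blocking.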
@@ -541,6 +623,36 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
 	return !vma->vm_ops;
 }
 
+static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+{
+	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+	if (!maybe_stack)
+		return false;
+
+	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+						VM_STACK_INCOMPLETE_SETUP)
+		return true;
+
+	return false;
+}
+
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+	if (!current->mm)
+		return true;
+
+	if (current->mm != vma->vm_mm)
+		return true;
+
+	return false;
+}
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
 #ifdef CONFIG_SHMEM
 /*
  * The vma_is_shmem is not inline because it is used only by slow
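vma_is_temporary_stack(), vma_is_foreign() and vma_is_accessible() give names to checks that were previously open-coded elsewhere in mm and arch code. A purely illustrative example of the kind of test the latter two tidy up (the wrapper function is made up and not part of the patch):

#include <linux/mm.h>

/*
 * Illustration only: the sort of open-coded checks the new helpers replace.
 */
static bool sketch_vma_ok_for_fault(struct vm_area_struct *vma)
{
	/*
	 * vma_is_accessible() stands in for open-coded
	 * "vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)" tests, e.g. for
	 * rejecting faults on PROT_NONE mappings.
	 */
	if (!vma_is_accessible(vma))
		return false;

	/*
	 * vma_is_foreign() stands in for open-coded "no mm, or current->mm
	 * != vma->vm_mm" tests used to detect accesses to another task's
	 * address space (remote GUP, protection-key checks and the like).
	 */
	if (vma_is_foreign(vma))
		return false;

	return true;
}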
@@ -770,6 +882,24 @@ static inline unsigned int compound_order(struct page *page)
 	return page[1].compound_order;
 }
 
+static inline bool hpage_pincount_available(struct page *page)
+{
+	/*
+	 * Can the page->hpage_pinned_refcount field be used? That field is in
+	 * the 3rd page of the compound page, so the smallest (2-page) compound
+	 * pages cannot support it.
+	 */
+	page = compound_head(page);
+	return PageCompound(page) && compound_order(page) > 1;
+}
+
+static inline int compound_pincount(struct page *page)
+{
+	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+	page = compound_head(page);
+	return atomic_read(compound_pincount_ptr(page));
+}
+
 static inline void set_compound_order(struct page *page, unsigned int order)
 {
 	page[1].compound_order = order;
@@ -1001,6 +1131,8 @@ static inline void get_page(struct page *page)
 	page_ref_inc(page);
 }
 
+bool __must_check try_grab_page(struct page *page, unsigned int flags);
+
 static inline __must_check bool try_get_page(struct page *page)
 {
 	page = compound_head(page);
@@ -1029,29 +1161,87 @@ static inline void put_page(struct page *page)
 		__put_page(page);
 }
 
-/**
- * unpin_user_page() - release a gup-pinned page
- * @page:            pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original page
+ * reference count, and also a new count of how many pin_user_pages() calls were
+ * made against the page. ("gup-pinned" is another term for the latter).
+ *
+ * With this scheme, pin_user_pages() becomes special: such pages are marked as
+ * distinct from normal pages. As such, the unpin_user_page() call (and its
+ * variants) must be used in order to release gup-pinned pages.
+ *
+ * Choice of value:
+ *
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to pin_user_pages() and unpin_user_page() becomes
+ * simpler, due to the fact that adding an even power of two to the page
+ * refcount has the effect of using only the upper N bits, for the code that
+ * counts up using the bias value. This means that the lower bits are left for
+ * the exclusive use of the original code that increments and decrements by one
+ * (or at least, by much smaller values than the bias value).
  *
- * Pages that were pinned via pin_user_pages*() must be released via either
- * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
- * that eventually such pages can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special handling.
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
  *
- * unpin_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. unpin_user_page() calls must
- * be perfectly matched up with pin*() calls.
+ * Locking: the lockless algorithm described in page_cache_get_speculative()
+ * and page_cache_gup_pin_speculative() provides safe operation for
+ * get_user_pages and page_mkclean and other calls that race to set up page
+ * table entries.
  */
-static inline void unpin_user_page(struct page *page)
-{
-	put_page(page);
-}
+#define GUP_PIN_COUNTING_BIAS (1U << 10)
 
+void unpin_user_page(struct page *page);
 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 				 bool make_dirty);
-
 void unpin_user_pages(struct page **pages, unsigned long npages);
 
+/**
+ * page_maybe_dma_pinned() - report if a page is pinned for DMA.
+ *
+ * This function checks if a page has been pinned via a call to
+ * pin_user_pages*().
+ *
+ * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
+ * because it means "definitely not pinned for DMA", but true means "probably
+ * pinned for DMA, but possibly a false positive due to having at least
+ * GUP_PIN_COUNTING_BIAS worth of normal page references".
+ *
+ * False positives are OK, because: a) it's unlikely for a page to get that many
+ * refcounts, and b) all the callers of this routine are expected to be able to
+ * deal gracefully with a false positive.
+ *
+ * For huge pages, the result will be exactly correct. That's because we have
+ * more tracking data available: the 3rd struct page in the compound page is
+ * used to track the pincount (instead of using the GUP_PIN_COUNTING_BIAS
+ * scheme).
+ *
+ * For more information, please see Documentation/vm/pin_user_pages.rst.
+ *
+ * @page:	pointer to page to be queried.
+ * @Return:	True, if it is likely that the page has been "dma-pinned".
+ *		False, if the page is definitely not dma-pinned.
+ */
+static inline bool page_maybe_dma_pinned(struct page *page)
+{
+	if (hpage_pincount_available(page))
+		return compound_pincount(page) > 0;
+
+	/*
+	 * page_ref_count() is signed. If that refcount overflows, then
+	 * page_ref_count() returns a negative value, and callers will avoid
+	 * further incrementing the refcount.
+	 *
+	 * Here, for that overflow case, use the signed bit to count a little
+	 * bit higher via unsigned math, and thus still get an accurate result.
+	 */
+	return ((unsigned int)page_ref_count(compound_head(page))) >=
+		GUP_PIN_COUNTING_BIAS;
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
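GUP_PIN_COUNTING_BIAS and page_maybe_dma_pinned() above are the header side of the pin_user_pages() work: a pin is accounted either as +GUP_PIN_COUNTING_BIAS on the ordinary refcount or, where hpage_pincount_available(), as an exact count in the third struct page. A hedged sketch of how a driver-style caller is expected to pair the calls (pin_user_pages_fast() comes from the same series and is assumed here; the function and variable names are invented):

#include <linux/mm.h>
#include <linux/slab.h>

/*
 * Illustration only: pin a user buffer for DMA, then release it with the
 * unpin_user_*() API rather than put_page(), as the comment above requires.
 */
static int sketch_pin_user_buffer(unsigned long uaddr, int nr_pages,
				  struct page ***pagesp)
{
	struct page **pages;
	int pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* Each successfully pinned page gains GUP_PIN_COUNTING_BIAS references. */
	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}

	/* May be a short pin; a real caller would handle pinned < nr_pages. */
	*pagesp = pages;
	return pinned;
}

static void sketch_unpin_user_buffer(struct page **pages, int pinned)
{
	/* The pin must be dropped with unpin_user_pages(), never put_page(). */
	unpin_user_pages(pages, pinned);
	kfree(pages);
}

page_maybe_dma_pinned() is the query side: as its comment says, order-0 results are fuzzy, because GUP_PIN_COUNTING_BIAS (1024) ordinary references are indistinguishable from a single pin, so callers must tolerate false positives.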
@@ -1599,9 +1789,26 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
 		bool need_rmap_locks);
+
+/*
+ * Flags used by change_protection().  For now we make it a bitmap so
+ * that we can pass in multiple flags just like parameters.  However
+ * for now all the callers use only one of the flags at the same
+ * time.
+ */
+/* Whether we should allow dirty bit accounting */
+#define  MM_CP_DIRTY_ACCT                  (1UL << 0)
+/* Whether this protection change is for NUMA hints */
+#define  MM_CP_PROT_NUMA                   (1UL << 1)
+/* Whether this change is for write protecting */
+#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
+#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
+#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
+					    MM_CP_UFFD_WP_RESOLVE)
+
 extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
 			      unsigned long end, pgprot_t newprot,
-			      int dirty_accountable, int prot_numa);
+			      unsigned long cp_flags);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
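change_protection() now takes a single cp_flags word in place of the old dirty_accountable/prot_numa ints, which leaves room for the userfaultfd write-protect bits. Roughly, the existing callers translate as in the sketch below; the wrapper functions are invented, and only the prototype and the MM_CP_* flags come from the hunk above:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Illustration only: how the old two-int call sites map onto cp_flags. */
static unsigned long sketch_mprotect_update(struct vm_area_struct *vma,
					    unsigned long start, unsigned long end,
					    pgprot_t newprot, bool dirty_accountable)
{
	unsigned long cp_flags = dirty_accountable ? MM_CP_DIRTY_ACCT : 0;

	/* Previously: change_protection(vma, start, end, newprot, dirty_accountable, 0); */
	return change_protection(vma, start, end, newprot, cp_flags);
}

static unsigned long sketch_numa_hinting_update(struct vm_area_struct *vma,
						unsigned long start, unsigned long end)
{
	/* Previously: change_protection(vma, start, end, PAGE_NONE, 0, 1); */
	return change_protection(vma, start, end, PAGE_NONE, MM_CP_PROT_NUMA);
}

MM_CP_UFFD_WP and MM_CP_UFFD_WP_RESOLVE are intended for the userfaultfd write-protect paths that accompany this change.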
@@ -1720,6 +1927,18 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
+static inline int pte_special(pte_t pte)
+{
+	return 0;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+	return pte;
+}
+#endif
+
 #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
@@ -2364,26 +2583,7 @@ struct vm_unmapped_area_info {
 	unsigned long align_offset;
 };
 
-extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
-extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
-
-/*
- * Search for an unmapped address range.
- *
- * We are looking for a range that:
- * - does not intersect with any VMA;
- * - is contained within the [low_limit, high_limit) interval;
- * - is at least the desired size.
- * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
- */
-static inline unsigned long
-vm_unmapped_area(struct vm_unmapped_area_info *info)
-{
-	if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
-		return unmapped_area_topdown(info);
-	else
-		return unmapped_area(info);
-}
+extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
 
 /* truncate.c */
 extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -2519,6 +2719,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num);
 int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
 				unsigned long num);
 int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
@@ -2867,6 +3069,23 @@ extern long copy_huge_page_from_user(struct page *dst_page,
 				const void __user *usr_src,
 				unsigned int pages_per_huge_page,
 				bool allow_pagefault);
+
+/**
+ * vma_is_special_huge - Are transhuge page-table entries considered special?
+ * @vma: Pointer to the struct vm_area_struct to consider
+ *
+ * Whether transhuge page-table entries are considered "special" following
+ * the definition in vm_normal_page().
+ *
+ * Return: true if transhuge page-table entries should be considered special,
+ * false otherwise.
+ */
+static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
+{
+	return vma_is_dax(vma) || (vma->vm_file &&
+				   (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
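One last note on the vm_unmapped_area() hunk above: the inline top-down/bottom-up dispatch becomes a single out-of-line function, but the removed comment still describes the contract (a free range inside [low_limit, high_limit), at least the requested length, with begin_addr & align_mask == align_offset & align_mask), and callers keep filling a struct vm_unmapped_area_info exactly as before. A rough sketch of such a caller; the wrapper is invented, and the field names other than align_offset are taken from the kernel struct rather than from the hunk shown here:

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustration only: the typical shape of a get_unmapped_area()-style caller.
 * Assumes a power-of-two alignment request.
 */
static unsigned long sketch_get_unmapped_area(unsigned long len,
					      unsigned long align,
					      bool topdown)
{
	struct vm_unmapped_area_info info = {};

	info.flags = topdown ? VM_UNMAPPED_AREA_TOPDOWN : 0;
	info.length = len;
	info.low_limit = PAGE_SIZE;			/* lowest acceptable address */
	info.high_limit = TASK_SIZE;			/* one past the highest */
	info.align_mask = align ? (align - 1) : 0;	/* begin_addr & align_mask ==  */
	info.align_offset = 0;				/* align_offset & align_mask   */

	/* A single out-of-line call; the TOPDOWN dispatch now happens inside. */
	return vm_unmapped_area(&info);
}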