Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--   include/linux/mm.h   587
1 file changed, 302 insertions, 285 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index f3f196e4d66d..1f79667824eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -137,7 +137,7 @@ extern int mmap_rnd_compat_bits __read_mostly;   * define their own version of this macro in <asm/pgtable.h>   */  #if BITS_PER_LONG == 64 -/* This function must be updated when the size of struct page grows above 80 +/* This function must be updated when the size of struct page grows above 96   * or reduces below 56. The idea that compiler optimizes out switch()   * statement, and only leaves move/store instructions. Also the compiler can   * combine write statements if they are both assignments and can be reordered, @@ -148,12 +148,18 @@ static inline void __mm_zero_struct_page(struct page *page)  {  	unsigned long *_pp = (void *)page; -	 /* Check that struct page is either 56, 64, 72, or 80 bytes */ +	 /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */  	BUILD_BUG_ON(sizeof(struct page) & 7);  	BUILD_BUG_ON(sizeof(struct page) < 56); -	BUILD_BUG_ON(sizeof(struct page) > 80); +	BUILD_BUG_ON(sizeof(struct page) > 96);  	switch (sizeof(struct page)) { +	case 96: +		_pp[11] = 0; +		fallthrough; +	case 88: +		_pp[10] = 0; +		fallthrough;  	case 80:  		_pp[9] = 0;  		fallthrough; @@ -276,7 +282,12 @@ extern unsigned int kobjsize(const void *objp);  #define VM_MAYSHARE	0x00000080  #define VM_GROWSDOWN	0x00000100	/* general info on the segment */ +#ifdef CONFIG_MMU  #define VM_UFFD_MISSING	0x00000200	/* missing pages tracking */ +#else /* CONFIG_MMU */ +#define VM_MAYOVERLAY	0x00000200	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ +#define VM_UFFD_MISSING	0 +#endif /* CONFIG_MMU */  #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */  #define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */ @@ -416,8 +427,8 @@ extern unsigned int kobjsize(const void *objp);  /* This mask defines which mm->def_flags a process can inherit its parent */  #define VM_INIT_DEF_MASK	VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK	(~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)  /* Arch-specific flags to clear when updating VM flags on protection change */  #ifndef VM_ARCH_CLEAR @@ -622,6 +633,63 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)  	INIT_LIST_HEAD(&vma->anon_vma_chain);  } +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, +				 vm_flags_t flags) +{ +	ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* Use when VMA is part of the VMA tree and modifications need coordination */ +static inline void vm_flags_reset(struct vm_area_struct *vma, +				  vm_flags_t flags) +{ +	mmap_assert_write_locked(vma->vm_mm); +	vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, +				       vm_flags_t flags) +{ +	mmap_assert_write_locked(vma->vm_mm); +	WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, +				vm_flags_t flags) +{ +	mmap_assert_write_locked(vma->vm_mm); +	ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, +				  vm_flags_t flags) +{ +	mmap_assert_write_locked(vma->vm_mm); +	ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + 
* Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, +				  vm_flags_t set, vm_flags_t clear) +{ +	vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. + */ +static inline void vm_flags_mod(struct vm_area_struct *vma, +				vm_flags_t set, vm_flags_t clear) +{ +	mmap_assert_write_locked(vma->vm_mm); +	__vm_flags_mod(vma, set, clear); +} +  static inline void vma_set_anonymous(struct vm_area_struct *vma)  {  	vma->vm_ops = NULL; @@ -665,16 +733,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)  static inline  struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)  { -	return mas_find(&vmi->mas, max); +	return mas_find(&vmi->mas, max - 1);  }  static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)  {  	/* -	 * Uses vma_find() to get the first VMA when the iterator starts. +	 * Uses mas_find() to get the first VMA when the iterator starts.  	 * Calling mas_next() could skip the first entry.  	 */ -	return vma_find(vmi, ULONG_MAX); +	return mas_find(&vmi->mas, ULONG_MAX);  }  static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) @@ -687,12 +755,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)  	return vmi->mas.index;  } +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ +	return vmi->mas.last + 1; +} +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, +				      unsigned long count) +{ +	return mas_expected_entries(&vmi->mas, count); +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ +	mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, +				      struct vm_area_struct *vma) +{ +	vmi->mas.index = vma->vm_start; +	vmi->mas.last = vma->vm_end - 1; +	mas_store(&vmi->mas, vma); +	if (unlikely(mas_is_err(&vmi->mas))) +		return -ENOMEM; + +	return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ +	mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ +	mas_set(&vmi->mas, addr); +} +  #define for_each_vma(__vmi, __vma)					\  	while (((__vma) = vma_next(&(__vmi))) != NULL)  /* The MM code likes to work with exclusive end addresses */  #define for_each_vma_range(__vmi, __vma, __end)				\ -	while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) +	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)  #ifdef CONFIG_SHMEM  /* @@ -714,11 +820,20 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);  struct mmu_gather;  struct inode; +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work.  These callers should be + * prepared to handle wild return values.  For example, PG_head may be + * set before _folio_order is initialised, or this may be a tail page. + * See compaction.c for some good examples. 
+ */  static inline unsigned int compound_order(struct page *page)  { -	if (!PageHead(page)) +	struct folio *folio = (struct folio *)page; + +	if (!test_bit(PG_head, &folio->flags))  		return 0; -	return page[1].compound_order; +	return folio->_folio_order;  }  /** @@ -777,6 +892,13 @@ static inline bool get_page_unless_zero(struct page *page)  	return page_ref_add_unless(page, 1, 0);  } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ +	if (unlikely(!get_page_unless_zero(page))) +		return NULL; +	return (struct folio *)page; +} +  extern int page_is_ram(unsigned long pfn);  enum { @@ -826,34 +948,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)  static inline int folio_entire_mapcount(struct folio *folio)  {  	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); -	return atomic_read(folio_mapcount_ptr(folio)) + 1; -} - -/* - * Mapcount of compound page as a whole, does not include mapped sub-pages. - * Must be called only on head of compound page. - */ -static inline int head_compound_mapcount(struct page *head) -{ -	return atomic_read(compound_mapcount_ptr(head)) + 1; -} - -/* - * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, - * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently - * leaves subpages_mapcount at 0, but avoid surprise if it participates later. - */ -#define COMPOUND_MAPPED	0x800000 -#define SUBPAGES_MAPPED	(COMPOUND_MAPPED - 1) - -/* - * Number of sub-pages mapped by PTE, does not include compound mapcount. - * Must be called only on head of compound page. - */ -static inline int head_subpages_mapcount(struct page *head) -{ -	return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; +	return atomic_read(&folio->_entire_mapcount) + 1;  }  /* @@ -866,25 +961,29 @@ static inline void page_mapcount_reset(struct page *page)  	atomic_set(&(page)->_mapcount, -1);  } -/* - * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount of compound_head of page. +/** + * page_mapcount() - Number of times this precise page is mapped. + * @page: The page. + * + * The number of times this page is mapped.  If this page is part of + * a large folio, it includes the number of times this page is mapped + * as part of that folio.   * - * Result is undefined for pages which cannot be mapped into userspace. + * The result is undefined for pages which cannot be mapped into userspace.   * For example SLAB or special types of pages. See function page_has_type(). - * They use this place in struct page differently. + * They use this field in struct page differently.   */  static inline int page_mapcount(struct page *page)  {  	int mapcount = atomic_read(&page->_mapcount) + 1; -	if (likely(!PageCompound(page))) -		return mapcount; -	page = compound_head(page); -	return head_compound_mapcount(page) + mapcount; +	if (unlikely(PageCompound(page))) +		mapcount += folio_entire_mapcount(page_folio(page)); + +	return mapcount;  } -int total_compound_mapcount(struct page *head); +int folio_total_mapcount(struct folio *folio);  /**   * folio_mapcount() - Calculate the number of mappings of this folio. 
@@ -901,24 +1000,24 @@ static inline int folio_mapcount(struct folio *folio)  {  	if (likely(!folio_test_large(folio)))  		return atomic_read(&folio->_mapcount) + 1; -	return total_compound_mapcount(&folio->page); +	return folio_total_mapcount(folio);  }  static inline int total_mapcount(struct page *page)  {  	if (likely(!PageCompound(page)))  		return atomic_read(&page->_mapcount) + 1; -	return total_compound_mapcount(compound_head(page)); +	return folio_total_mapcount(page_folio(page));  }  static inline bool folio_large_is_mapped(struct folio *folio)  {  	/* -	 * Reading folio_mapcount_ptr() below could be omitted if hugetlb -	 * participated in incrementing subpages_mapcount when compound mapped. +	 * Reading _entire_mapcount below could be omitted if hugetlb +	 * participated in incrementing nr_pages_mapped when compound mapped.  	 */ -	return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || -		atomic_read(folio_mapcount_ptr(folio)) >= 0; +	return atomic_read(&folio->_nr_pages_mapped) > 0 || +		atomic_read(&folio->_entire_mapcount) >= 0;  }  /** @@ -993,8 +1092,11 @@ extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];  static inline void set_compound_page_dtor(struct page *page,  		enum compound_dtor_id compound_dtor)  { +	struct folio *folio = (struct folio *)page; +  	VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); -	page[1].compound_dtor = compound_dtor; +	VM_BUG_ON_PAGE(!PageHead(page), page); +	folio->_folio_dtor = compound_dtor;  }  static inline void folio_set_compound_dtor(struct folio *folio, @@ -1006,44 +1108,13 @@ static inline void folio_set_compound_dtor(struct folio *folio,  void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ -	return atomic_read(compound_pincount_ptr(head)); -} -  static inline void set_compound_order(struct page *page, unsigned int order)  { -	page[1].compound_order = order; -#ifdef CONFIG_64BIT -	page[1].compound_nr = 1U << order; -#endif -} - -/* - * folio_set_compound_order is generally passed a non-zero order to - * initialize a large folio.  However, hugetlb code abuses this by - * passing in zero when 'dissolving' a large folio. - */ -static inline void folio_set_compound_order(struct folio *folio, -		unsigned int order) -{ -	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); +	struct folio *folio = (struct folio *)page;  	folio->_folio_order = order;  #ifdef CONFIG_64BIT -	folio->_folio_nr_pages = order ? 1U << order : 0; -#endif -} - -/* Returns the number of pages in this potentially compound page. */ -static inline unsigned long compound_nr(struct page *page) -{ -	if (!PageHead(page)) -		return 1; -#ifdef CONFIG_64BIT -	return page[1].compound_nr; -#else -	return 1UL << compound_order(page); +	folio->_folio_nr_pages = 1U << order;  #endif  } @@ -1070,16 +1141,6 @@ static inline unsigned int thp_order(struct page *page)  }  /** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ -	VM_BUG_ON_PGFLAGS(PageTail(page), page); -	return compound_nr(page); -} - -/**   * thp_size - Size of a transparent huge page.   * @page: Head page of a transparent huge page.   
* @@ -1220,8 +1281,6 @@ static inline void get_page(struct page *page)  	folio_get(page_folio(page));  } -int __must_check try_grab_page(struct page *page, unsigned int flags); -  static inline __must_check bool try_get_page(struct page *page)  {  	page = compound_head(page); @@ -1270,10 +1329,10 @@ static inline void folio_put_refs(struct folio *folio, int refs)  		__folio_put(folio);  } -/** - * release_pages - release an array of pages or folios +/* + * union release_pages_arg - an array of pages or folios   * - * This just releases a simple array of multiple pages, and + * release_pages() releases a simple array of multiple pages, and   * accepts various different forms of said page array: either   * a regular old boring array of pages, an array of folios, or   * an array of encoded page pointers. @@ -1363,6 +1422,21 @@ static inline bool is_cow_mapping(vm_flags_t flags)  	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;  } +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) +{ +	/* +	 * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected +	 * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of +	 * a file mapping. R/O MAP_PRIVATE mappings might still modify +	 * underlying memory if ptrace is active, so this is only possible if +	 * ptrace does not apply. Note that there is no mprotect() to upgrade +	 * write permissions later. +	 */ +	return flags & (VM_MAYSHARE | VM_MAYOVERLAY); +} +#endif +  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)  #define SECTION_IN_PAGE_FLAGS  #endif @@ -1637,11 +1711,6 @@ static inline struct folio *pfn_folio(unsigned long pfn)  	return page_folio(pfn_to_page(pfn));  } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ -	return &folio_page(folio, 1)->compound_pincount; -} -  /**   * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.   * @folio: The folio. @@ -1659,7 +1728,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio)   * expected to be able to deal gracefully with a false positive.   *   * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used   * instead of the GUP_PIN_COUNTING_BIAS scheme.   *   * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1670,7 +1739,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio)  static inline bool folio_maybe_dma_pinned(struct folio *folio)  {  	if (folio_test_large(folio)) -		return atomic_read(folio_pincount_ptr(folio)) > 0; +		return atomic_read(&folio->_pincount) > 0;  	/*  	 * folio_ref_count() is signed. If that refcount overflows, then @@ -1778,6 +1847,33 @@ static inline long folio_nr_pages(struct folio *folio)  #endif  } +/* + * compound_nr() returns the number of pages in this potentially compound + * page.  compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. + */ +static inline unsigned long compound_nr(struct page *page) +{ +	struct folio *folio = (struct folio *)page; + +	if (!test_bit(PG_head, &folio->flags)) +		return 1; +#ifdef CONFIG_64BIT +	return folio->_folio_nr_pages; +#else +	return 1L << folio->_folio_order; +#endif +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. 
+ */ +static inline int thp_nr_pages(struct page *page) +{ +	return folio_nr_pages((struct folio *)page); +} +  /**   * folio_next - Move to the next physical folio.   * @folio: The folio we're currently operating on. @@ -1827,6 +1923,24 @@ static inline size_t folio_size(struct folio *folio)  	return PAGE_SIZE << folio_order(folio);  } +/** + * folio_estimated_sharers - Estimate the number of sharers of a folio. + * @folio: The folio. + * + * folio_estimated_sharers() aims to serve as a function to efficiently + * estimate the number of processes sharing a folio. This is done by + * looking at the precise mapcount of the first subpage in the folio, and + * assuming the other subpages are the same. This may not be true for large + * folios. If you want exact mapcounts for exact calculations, look at + * page_mapcount() or folio_total_mapcount(). + * + * Return: The estimated number of processes sharing a folio. + */ +static inline int folio_estimated_sharers(struct folio *folio) +{ +	return page_mapcount(folio_page(folio, 0)); +} +  #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE  static inline int arch_make_page_accessible(struct page *page)  { @@ -1923,6 +2037,21 @@ static inline bool page_is_pfmemalloc(const struct page *page)  }  /* + * Return true only if the folio has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool folio_is_pfmemalloc(const struct folio *folio) +{ +	/* +	 * lru.next has bit 1 set if the page is allocated from the +	 * pfmemalloc reserves.  Callers may simply overwrite it if +	 * they do not need to preserve that information. +	 */ +	return (uintptr_t)folio->lru.next & BIT(1); +} + +/*   * Only to be called by the page allocator on a freshly allocated   * page.   */ @@ -1976,6 +2105,31 @@ struct zap_details {  /* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */  #define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1)) +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ +	return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ +	/* +	 * Use the processor id as a fall-back when the mm cid feature is +	 * disabled. This provides functional per-cpu data structure accesses +	 * in user-space, althrough it won't provide the memory usage benefits. 
+	 */ +	return raw_smp_processor_id(); +} +#endif +  #ifdef CONFIG_MMU  extern bool can_do_mlock(void);  #else @@ -1984,6 +2138,8 @@ static inline bool can_do_mlock(void) { return false; }  extern int user_shm_lock(size_t, struct ucounts *);  extern void user_shm_unlock(size_t, struct ucounts *); +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, +			     pte_t pte);  struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,  			     pte_t pte);  struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1991,13 +2147,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,  void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,  		  unsigned long size); -void zap_page_range(struct vm_area_struct *vma, unsigned long address, -		    unsigned long size);  void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,  			   unsigned long size, struct zap_details *details); +static inline void zap_vma_pages(struct vm_area_struct *vma) +{ +	zap_page_range_single(vma, vma->vm_start, +			      vma->vm_end - vma->vm_start, NULL); +}  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,  		struct vm_area_struct *start_vma, unsigned long start, -		unsigned long end); +		unsigned long end, bool mm_wr_locked);  struct mmu_notifier_range; @@ -2095,8 +2254,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,  			struct task_struct *task, bool bypass_rlim);  struct kvec; -int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, -			struct page **pages);  struct page *get_dump_page(unsigned long addr);  bool folio_mark_dirty(struct folio *folio); @@ -2146,21 +2303,18 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma  }  bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,  			     pte_t pte); -extern unsigned long change_protection(struct mmu_gather *tlb, +extern long change_protection(struct mmu_gather *tlb,  			      struct vm_area_struct *vma, unsigned long start, -			      unsigned long end, pgprot_t newprot, -			      unsigned long cp_flags); -extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, -			  struct vm_area_struct **pprev, unsigned long start, -			  unsigned long end, unsigned long newflags); +			      unsigned long end, unsigned long cp_flags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, +	  struct vm_area_struct *vma, struct vm_area_struct **pprev, +	  unsigned long start, unsigned long end, unsigned long newflags);  /*   * doesn't attempt to fault and will return short.   
*/  int get_user_pages_fast_only(unsigned long start, int nr_pages,  			     unsigned int gup_flags, struct page **pages); -int pin_user_pages_fast_only(unsigned long start, int nr_pages, -			     unsigned int gup_flags, struct page **pages);  static inline bool get_user_page_fast_only(unsigned long addr,  			unsigned int gup_flags, struct page **pagep) @@ -2784,23 +2938,21 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);  /* mmap.c */  extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, -	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, -	struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, -	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) -{ -	return __vma_adjust(vma, start, end, pgoff, insert, NULL); -} -extern struct vm_area_struct *vma_merge(struct mm_struct *, -	struct vm_area_struct *prev, unsigned long addr, unsigned long end, -	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, -	struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); +extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, +		      unsigned long start, unsigned long end, pgoff_t pgoff, +		      struct vm_area_struct *next); +extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, +		       unsigned long start, unsigned long end, pgoff_t pgoff); +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, +	struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, +	unsigned long end, unsigned long vm_flags, struct anon_vma *, +	struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, +	struct anon_vma_name *);  extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, -	unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, -	unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, +		       unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, +			 unsigned long addr, int new_below);  extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);  extern void unlink_file_vma(struct vm_area_struct *);  extern struct vm_area_struct *copy_vma(struct vm_area_struct **, @@ -2808,9 +2960,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,  	bool *need_rmap_locks);  extern void exit_mmap(struct mm_struct *); -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); -  static inline int check_data_rlimit(unsigned long rlim,  				    unsigned long new,  				    unsigned long start, @@ -2858,7 +3007,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,  extern unsigned long do_mmap(struct file *file, unsigned long addr,  	unsigned long len, unsigned long prot, unsigned long flags,  	unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,  			 unsigned long start, size_t len, struct list_head *uf,  			 bool downgrade);  extern int do_munmap(struct mm_struct *, unsigned long, size_t, 
@@ -2866,6 +3015,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t,  extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);  #ifdef CONFIG_MMU +extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, +			 unsigned long start, unsigned long end, +			 struct list_head *uf, bool downgrade);  extern int __mm_populate(unsigned long addr, unsigned long len,  			 int ignore_errors);  static inline void mm_populate(unsigned long addr, unsigned long len) @@ -3071,81 +3223,6 @@ static inline vm_fault_t vmf_error(int err)  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,  			 unsigned int foll_flags); -#define FOLL_WRITE	0x01	/* check pte is writable */ -#define FOLL_TOUCH	0x02	/* mark page accessed */ -#define FOLL_GET	0x04	/* do get_page on page */ -#define FOLL_DUMP	0x08	/* give error on hole if it would be zero */ -#define FOLL_FORCE	0x10	/* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT	0x20	/* if a disk transfer is needed, start the IO -				 * and return without waiting upon it */ -#define FOLL_NOFAULT	0x80	/* do not fault in pages */ -#define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */ -#define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */ -#define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */ -#define FOLL_ANON	0x8000	/* don't do file mappings */ -#define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */ -#define FOLL_PIN	0x40000	/* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY	0x80000	/* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA	0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE  0x200000 /* allow interrupts from generic signals */ - -/* - * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each - * other. Here is what they mean, and how to use them: - * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control.  This is in contrast to - * iov_iter_get_pages(), whose usages are transient. - * - * FIXME: For pages which are part of a filesystem, mappings are subject to the - * lifetime enforced by the filesystem and we need guarantees that longterm - * users like RDMA and V4L2 only establish mappings which coordinate usage with - * the filesystem.  Ideas for this coordination include revoking the longterm - * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was - * added after the problem with filesystems was found FS DAX VMAs are - * specifically failed.  Filesystem pages are still subject to bugs and use of - * FOLL_LONGTERM should be avoided on those pages. - * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed.  This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * - * In the CMA case: long term pins in a CMA region would unnecessarily fragment - * that region.  And so, CMA attempts to migrate the page before pinning, when - * FOLL_LONGTERM is specified. - * - * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, - * but an additional pin counting system) will be invoked. 
This is intended for - * anything that gets a page reference and then touches page data (for example, - * Direct IO). This lets the filesystem know that some non-file-system entity is - * potentially changing the pages' data. In contrast to FOLL_GET (whose pages - * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to unpin_user_page(). - * - * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different - * and separate refcounting mechanisms, however, and that means that each has - * its own acquire and release mechanisms: - * - *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release. - * - *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. - * - * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. - * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based - * calls applied to them, and that's perfectly OK. This is a constraint on the - * callers, not on the pages.) - * - * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never - * directly by the caller. That's in order to help avoid mismatches when - * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via unpin_user_page(). - * - * Please see Documentation/core-api/pin_user_pages.rst for more information. - */ -  static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)  {  	if (vm_fault & VM_FAULT_OOM) @@ -3158,71 +3235,6 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)  }  /* - * Indicates for which pages that are write-protected in the page table, - * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the - * GUP pin will remain consistent with the pages mapped into the page tables - * of the MM. - * - * Temporary unmapping of PageAnonExclusive() pages or clearing of - * PageAnonExclusive() has to protect against concurrent GUP: - * * Ordinary GUP: Using the PT lock - * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): see - *    page_try_share_anon_rmap() - * - * Must be called with the (sub)page that's actually referenced via the - * page table entry, which might not necessarily be the head page for a - * PTE-mapped THP. - * - * If the vma is NULL, we're coming from the GUP-fast path and might have - * to fallback to the slow path just to lookup the vma. - */ -static inline bool gup_must_unshare(struct vm_area_struct *vma, -				    unsigned int flags, struct page *page) -{ -	/* -	 * FOLL_WRITE is implicitly handled correctly as the page table entry -	 * has to be writable -- and if it references (part of) an anonymous -	 * folio, that part is required to be marked exclusive. -	 */ -	if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) -		return false; -	/* -	 * Note: PageAnon(page) is stable until the page is actually getting -	 * freed. -	 */ -	if (!PageAnon(page)) { -		/* -		 * We only care about R/O long-term pining: R/O short-term -		 * pinning does not have the semantics to observe successive -		 * changes through the process page tables. -		 */ -		if (!(flags & FOLL_LONGTERM)) -			return false; - -		/* We really need the vma ... */ -		if (!vma) -			return true; - -		/* -		 * ... because we only care about writable private ("COW") -		 * mappings where we have to break COW early. 
-		 */ -		return is_cow_mapping(vma->vm_flags); -	} - -	/* Paired with a memory barrier in page_try_share_anon_rmap(). */ -	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) -		smp_rmb(); - -	/* -	 * Note that PageKsm() pages cannot be exclusive, and consequently, -	 * cannot get pinned. -	 */ -	return !PageAnonExclusive(page); -} - -/*   * Indicates whether GUP can follow a PROT_NONE mapped page, or whether   * a (NUMA hinting) fault is required.   */ @@ -3521,6 +3533,11 @@ enum mf_action_page_type {  	MF_MSG_UNKNOWN,  }; +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; +  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)  extern void clear_huge_page(struct page *page,  			    unsigned long addr_hint, @@ -3638,7 +3655,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)  		 * VM_MAYWRITE as we still want them to be COW-writable.  		 */  		if (vma->vm_flags & VM_SHARED) -			vma->vm_flags &= ~(VM_MAYWRITE); +			vm_flags_clear(vma, VM_MAYWRITE);  	}  	return 0;  |
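The largest behavioural change above is that vma->vm_flags becomes private to the new vm_flags_*() helpers, which (apart from vm_flags_init()) assert that the mmap write lock is held, and that VM_LOCKED_CLEAR_MASK is replaced by the positive mask VM_LOCKED_MASK. Below is a minimal sketch of how a caller is expected to use the helpers; it is not taken from the patch, the driver and the function name example_mmap() are hypothetical, and the flag choice is only illustrative. A ->mmap() handler runs with the mmap write lock already held by the core mmap path, which is exactly what the helpers assert.

#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Hypothetical driver ->mmap() handler.  The core mmap path already holds
 * the mmap write lock here, so vm_flags_set()/vm_flags_clear() may be used
 * instead of writing vma->vm_flags directly, which is now read-only for
 * ordinary code.
 */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Mark the mapping as I/O-like memory that must not expand or dump. */
	vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP);

	/* Drop any mlock-related bits, using the renamed VM_LOCKED_MASK. */
	vm_flags_clear(vma, VM_LOCKED_MASK);

	return 0;
}

The seal_check_future_write() hunk at the end of the diff shows the same conversion in-tree: "vma->vm_flags &= ~(VM_MAYWRITE)" becomes "vm_flags_clear(vma, VM_MAYWRITE)".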
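The iterator hunks change vma_find() to take an exclusive end address (the "- 1" moves inside the helper, so for_each_vma_range() now passes __end through unchanged) and add vma_iter_end(), vma_iter_bulk_alloc(), vma_iter_free(), vma_iter_bulk_store(), vma_iter_invalidate() and vma_iter_set(). A minimal sketch of a range walk under the read lock follows; example_count_vmas() is a hypothetical function, not part of the patch.

#include <linux/mm.h>
#include <linux/mmap_lock.h>

/*
 * Hypothetical helper: count the VMAs intersecting [start, end).
 * @end is exclusive; since this change the iterator converts to the
 * inclusive maple-tree index internally.
 */
static int example_count_vmas(struct mm_struct *mm,
			      unsigned long start, unsigned long end)
{
	VMA_ITERATOR(vmi, mm, start);
	struct vm_area_struct *vma;
	int count = 0;

	mmap_read_lock(mm);
	for_each_vma_range(vmi, vma, end)
		count++;
	mmap_read_unlock(mm);

	return count;
}

The bulk helpers, vma_iter_bulk_alloc() and vma_iter_free(), wrap mas_expected_entries() and mas_destroy() for callers that store many VMAs in one go, such as fork().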
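folio_get_nontail_page() packages the "speculatively grab a page we do not yet hold a reference on" pattern used by compaction and other PFN walkers: it fails both for freed pages and for tail pages, whose refcount is zero. A sketch under those assumptions; example_try_grab() is hypothetical.

#include <linux/mm.h>

/*
 * Hypothetical PFN-walker step: try to take a reference on whatever sits
 * at @pfn.  Returns the folio with a reference held, or NULL if the pfn
 * is invalid, the page was free, or it is a tail page.
 */
static struct folio *example_try_grab(unsigned long pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	return folio_get_nontail_page(page);
}

A non-NULL return must eventually be balanced with folio_put().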
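The mapcount hunks move the compound counters into struct folio (_entire_mapcount, _nr_pages_mapped, _pincount) and redefine page_mapcount(), folio_mapcount() and the new folio_estimated_sharers() on top of them. The sketch below contrasts the exact and estimated counts; example_report_mapcounts() is a hypothetical debug helper and assumes the caller holds a folio reference so the counters are stable.

#include <linux/mm.h>
#include <linux/printk.h>

/* Hypothetical debug helper: report how a folio is currently mapped. */
static void example_report_mapcounts(struct folio *folio)
{
	/* Exact count over every page of the folio, PMD mappings included. */
	int total = folio_mapcount(folio);

	/* Cheap estimate: the precise mapcount of the first subpage only. */
	int estimate = folio_estimated_sharers(folio);

	pr_info("order=%u mapcount=%d estimated_sharers=%d dma_pinned=%d\n",
		folio_order(folio), total, estimate,
		folio_maybe_dma_pinned(folio));
}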
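compound_order(), compound_nr() and thp_nr_pages() become thin wrappers over the folio fields (_folio_order, _folio_nr_pages), and the new comment warns that without a reference the results can be wild. A sketch of the stable way to size an allocation once a reference is held; example_alloc_bytes() is hypothetical.

#include <linux/mm.h>

/*
 * Hypothetical helper: bytes covered by the allocation that @page belongs
 * to.  Assumes the caller holds a reference, so page_folio() and the folio
 * metadata are stable.  folio_nr_pages() equals compound_nr() and
 * thp_nr_pages() when called on the head page.
 */
static unsigned long example_alloc_bytes(struct page *page)
{
	struct folio *folio = page_folio(page);

	return folio_nr_pages(folio) * PAGE_SIZE;
}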
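The CONFIG_SCHED_MM_CID block adds task_mm_cid() together with the scheduler hooks; with the feature disabled the helper falls back to raw_smp_processor_id(), so the index is usable either way. A sketch of per-concurrency-id accounting follows; struct example_stats and example_account() are hypothetical, and sizing the array at nr_cpu_ids entries is an assumption based on the fallback being a CPU number, not something stated in the patch.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Hypothetical per-concurrency-id statistics.  task_mm_cid() returns a
 * compact id when CONFIG_SCHED_MM_CID is enabled and the CPU number
 * otherwise; an array of nr_cpu_ids counters is assumed to cover both.
 * Updates may race if the task migrates, which is tolerable for stats.
 */
struct example_stats {
	unsigned long *per_cid;		/* nr_cpu_ids entries, assumed */
};

static void example_account(struct example_stats *stats)
{
	stats->per_cid[task_mm_cid(current)]++;
}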
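zap_page_range() is removed in favour of zap_page_range_single(), and the new zap_vma_pages() wraps the common "unmap every page of one VMA" case. A sketch of a revoke-style user; example_revoke_mappings() is hypothetical, and the assumption that the mmap read lock suffices for zapping mirrors what madvise(MADV_DONTNEED) does rather than anything stated in this header.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmap_lock.h>

/*
 * Hypothetical revoke helper: unmap the user PTEs of every VMA in @mm that
 * maps @file, e.g. before the backing device goes away.  zap_vma_pages()
 * expands to zap_page_range_single(vma, vma->vm_start,
 * vma->vm_end - vma->vm_start, NULL).
 */
static void example_revoke_mappings(struct mm_struct *mm, struct file *file)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_file == file)
			zap_vma_pages(vma);
	}
	mmap_read_unlock(mm);
}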