Diffstat (limited to 'arch/s390/mm/pgalloc.c')
-rw-r--r--  arch/s390/mm/pgalloc.c | 244
1 file changed, 176 insertions(+), 68 deletions(-)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 781965f7210e..fd35c1a0213b 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
 #endif /* CONFIG_PGSTE */
 
 /*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either the upper or lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When a 2KB-pgtable is freed from a fully allocated parent page, that
+ * page turns partially allocated and is added to mm_context_t::pgtable_list.
+ *
+ * If a 2KB-pgtable is freed from a partially allocated parent page, that
+ * page turns unused and is removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ *   PP  AA
+ * 01234567    upper byte (bits 24-31) of struct page::_refcount
+ *   ||  ||
+ *   ||  |+--- upper 2KB-pgtable is allocated
+ *   ||  +---- lower 2KB-pgtable is allocated
+ *   |+------- upper 2KB-pgtable is pending for removal
+ *   +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still unallocated;
+ *   - removed from mm_context_t::pgtable_list in case both halves of the
+ *     parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still allocated;
+ *   - removed from mm_context_t::pgtable_list in case the second half of
+ *     the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
+ * also been removed from mm_context_t::pgtable_list. It is therefore safe
+ * to release the page.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor is such a page added to or removed
+ * from mm_context_t::pgtable_list.
+ */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
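To make the diagram above concrete, here is a minimal user-space sketch of the upper-byte encoding. It is not kernel code: page->_refcount is modelled by a C11 atomic, the kernel's atomic_xor_bits() (which returns the new value) by atomic_fetch_xor(), and sim_xor_bits() is an invented name for this illustration.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint refcount;	/* models page->_refcount */

/* Models atomic_xor_bits(): XOR the bits in, return the new value. */
static unsigned int sim_xor_bits(unsigned int bits)
{
	return atomic_fetch_xor(&refcount, bits) ^ bits;
}

int main(void)
{
	unsigned int mask;

	/* Allocate both 2KB halves: upper byte becomes AA = 11 (0x03). */
	sim_xor_bits(0x01U << (0 + 24));
	sim_xor_bits(0x01U << (1 + 24));

	/* Free the lower half: its AA bit goes off and its PP bit goes
	 * on in a single XOR, so the upper byte goes 0x03 -> 0x12. */
	mask = sim_xor_bits(0x11U << (0 + 24)) >> 24;

	/* The invariant: AA and PP of the same fragment are never both 1. */
	assert(((mask >> 4) & mask & 0x03U) == 0);

	/* The allocator folds PP into AA before testing availability, so
	 * a fragment pending removal is not handed out again. */
	printf("busy mask = 0x%02x\n", (mask | (mask >> 4)) & 0x03U);
	return 0;
}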
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 			page = list_first_entry(&mm->context.pgtable_list,
 						struct page, lru);
 			mask = atomic_read(&page->_refcount) >> 24;
-			mask = (mask | (mask >> 4)) & 3;
-			if (mask != 3) {
+			/*
+			 * The pending removal bits must also be checked.
+			 * Failure to do so might lead to an impossible
+			 * value (i.e. 0x13 or 0x23) written to _refcount.
+			 * Such values violate the assumption that pending and
+			 * allocation bits are mutually exclusive, and the rest
+			 * of the code derails as a result. That could lead to
+			 * a whole bunch of races and corruptions.
+			 */
+			mask = (mask | (mask >> 4)) & 0x03U;
+			if (mask != 0x03U) {
 				table = (unsigned long *) page_to_virt(page);
 				bit = mask & 1;		/* =1 -> second 2K */
 				if (bit)
 					table += PTRS_PER_PTE;
 				atomic_xor_bits(&page->_refcount,
-							1U << (bit + 24));
+							0x01U << (bit + 24));
 				list_del(&page->lru);
 			}
 		}
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	table = (unsigned long *) page_to_virt(page);
 	if (mm_alloc_pgste(mm)) {
 		/* Return 4K page table with PGSTEs */
-		atomic_xor_bits(&page->_refcount, 3 << 24);
+		atomic_xor_bits(&page->_refcount, 0x03U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
 		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	} else {
 		/* Return the first 2K fragment of the page */
-		atomic_xor_bits(&page->_refcount, 1 << 24);
+		atomic_xor_bits(&page->_refcount, 0x01U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
 		spin_lock_bh(&mm->context.lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
@@ -234,29 +311,53 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	return table;
 }
 
+static void page_table_release_check(struct page *page, void *table,
+				     unsigned int half, unsigned int mask)
+{
+	char msg[128];
+
+	if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+		return;
+	snprintf(msg, sizeof(msg),
+		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
+		 table, half, mask);
+	dump_page(page, msg);
+}
+
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
+	unsigned int mask, bit, half;
 	struct page *page;
-	unsigned int bit, mask;
 
 	page = virt_to_page(table);
 	if (!mm_alloc_pgste(mm)) {
 		/* Free 2K page table fragment of a 4K page */
 		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 		spin_lock_bh(&mm->context.lock);
-		mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+		/*
+		 * Mark the page for delayed release. The actual release
+		 * will happen outside of the critical section from this
+		 * function or from __tlb_remove_table().
+		 */
+		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 		mask >>= 24;
-		if (mask & 3)
+		if (mask & 0x03U)
 			list_add(&page->lru, &mm->context.pgtable_list);
 		else
 			list_del(&page->lru);
 		spin_unlock_bh(&mm->context.lock);
-		if (mask != 0)
+		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+		mask >>= 24;
+		if (mask != 0x00U)
 			return;
+		half = 0x01U << bit;
 	} else {
-		atomic_xor_bits(&page->_refcount, 3U << 24);
+		half = 0x03U;
+		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+		mask >>= 24;
 	}
 
+	page_table_release_check(page, table, half, mask);
 	pgtable_pte_page_dtor(page);
 	__free_page(page);
 }
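The subtle part of page_table_free() above is that the release happens in two steps: the first XOR (0x11U << (bit + 24)) clears AA and sets PP while the list is updated under mm->context.lock, and a second XOR (0x10U << (bit + 24)) outside the lock clears PP again; whoever sees the upper byte drop to zero performs the actual free. Below is a user-space sketch of just that protocol, with locking and list handling elided and free_2k_fragment()/release_page() as invented names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint refcount;	/* models page->_refcount */

/* Models atomic_xor_bits(): XOR the bits in, return the new value. */
static unsigned int xor_bits(unsigned int bits)
{
	return atomic_fetch_xor(&refcount, bits) ^ bits;
}

static void release_page(void)
{
	/* stands in for pgtable_pte_page_dtor() + __free_page() */
	printf("page released\n");
}

static void free_2k_fragment(unsigned int bit)
{
	unsigned int mask;

	/* Step 1 (under mm->context.lock in the kernel): clear AA and
	 * set PP for this fragment in one atomic operation. */
	mask = xor_bits(0x11U << (bit + 24)) >> 24;
	if (mask & 0x03U)
		printf("other half still allocated: page on pgtable_list\n");
	else
		printf("page leaves pgtable_list\n");

	/* Step 2 (outside the lock): clear our PP bit. The caller that
	 * sees the upper byte reach zero owns the final release. */
	mask = xor_bits(0x10U << (bit + 24)) >> 24;
	if (mask == 0x00U)
		release_page();
}

int main(void)
{
	atomic_store(&refcount, 0x03U << 24);	/* both halves allocated */
	free_2k_fragment(0);	/* page becomes partially allocated */
	free_2k_fragment(1);	/* page becomes unused and is released */
	return 0;
}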
@@ -272,47 +373,54 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 	page = virt_to_page(table);
 	if (mm_alloc_pgste(mm)) {
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) ((unsigned long)table | 3);
+		table = (unsigned long *) ((unsigned long)table | 0x03U);
 		tlb_remove_table(tlb, table);
 		return;
 	}
 	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
 	spin_lock_bh(&mm->context.lock);
+	/*
+	 * Mark the page for delayed release. The actual release will happen
+	 * outside of the critical section from __tlb_remove_table() or from
+	 * page_table_free().
+	 */
 	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 	mask >>= 24;
-	if (mask & 3)
+	if (mask & 0x03U)
 		list_add_tail(&page->lru, &mm->context.pgtable_list);
 	else
 		list_del(&page->lru);
 	spin_unlock_bh(&mm->context.lock);
-	table = (unsigned long *) ((unsigned long) table | (1U << bit));
+	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
 	tlb_remove_table(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 3;
+	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
 	void *table = (void *)((unsigned long) _table ^ mask);
 	struct page *page = virt_to_page(table);
 
-	switch (mask) {
-	case 0:		/* pmd, pud, or p4d */
+	switch (half) {
+	case 0x00U:	/* pmd, pud, or p4d */
 		free_pages((unsigned long) table, 2);
-		break;
-	case 1:		/* lower 2K of a 4K page table */
-	case 2:		/* higher 2K of a 4K page table */
+		return;
+	case 0x01U:	/* lower 2K of a 4K page table */
+	case 0x02U:	/* higher 2K of a 4K page table */
 		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
 		mask >>= 24;
-		if (mask != 0)
-			break;
-		fallthrough;
-	case 3:		/* 4K page table with pgstes */
-		if (mask & 3)
-			atomic_xor_bits(&page->_refcount, 3 << 24);
-		pgtable_pte_page_dtor(page);
-		__free_page(page);
+		if (mask != 0x00U)
+			return;
+		break;
+	case 0x03U:	/* 4K page table with pgstes */
+		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+		mask >>= 24;
 		break;
 	}
+
+	page_table_release_check(page, table, half, mask);
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
 }
 
 /*
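page_table_free_rcu() and __tlb_remove_table() above pass information through the low bits of the table pointer: table fragments are at least 2KB-aligned, so bits 0-1 are free to encode which 2KB half (or a full 4KB PGSTE table, or a CRST table) is being released. A self-contained sketch of that tagging, with invented names:

#include <stdint.h>
#include <stdio.h>

/* Stash a 2-bit tag (0x00-0x03) in the low bits of an aligned pointer. */
static void *tag_table(void *table, unsigned int tag)
{
	return (void *)((uintptr_t)table | tag);
}

/* Recover the tag and the original pointer, as __tlb_remove_table() does. */
static void untag_table(void *_table)
{
	unsigned int mask = (uintptr_t)_table & 0x03U;
	void *table = (void *)((uintptr_t)_table ^ mask);

	switch (mask) {
	case 0x00U:
		printf("%p: CRST table (pmd/pud/p4d)\n", table);
		break;
	case 0x01U:
	case 0x02U:
		printf("%p: %s 2K fragment\n", table,
		       mask == 0x01U ? "lower" : "upper");
		break;
	case 0x03U:
		printf("%p: full 4K table with PGSTEs\n", table);
		break;
	}
}

int main(void)
{
	/* A dummy, page-aligned "table"; each half is 2KB-aligned. */
	static _Alignas(4096) unsigned long table[512];

	untag_table(tag_table(table, 0x01U));		/* lower half */
	untag_table(tag_table(&table[256], 0x02U));	/* upper half */
	return 0;
}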
@@ -322,34 +430,34 @@ void __tlb_remove_table(void *_table)
 
 static struct kmem_cache *base_pgt_cache;
 
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
 {
-	u64 *table;
+	unsigned long *table;
 
 	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
 	if (table)
-		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
-	return (unsigned long) table;
+		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+	return table;
 }
 
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
 {
-	kmem_cache_free(base_pgt_cache, (void *) table);
+	kmem_cache_free(base_pgt_cache, table);
 }
 
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
 {
-	unsigned long table;
+	unsigned long *table;
 
-	table =	 __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+	table =	(unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
 	if (table)
-		crst_table_init((unsigned long *)table, val);
+		crst_table_init(table, val);
 	return table;
 }
 
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
 {
-	free_pages(table, CRST_ALLOC_ORDER);
+	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
 }
 
 #define BASE_ADDR_END_FUNC(NAME, SIZE)					\
@@ -377,14 +485,14 @@ static inline unsigned long base_lra(unsigned long address)
 	return real;
 }
 
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
 			  unsigned long end, int alloc)
 {
 	unsigned long *pte, next;
 
 	if (!alloc)
 		return 0;
-	pte = (unsigned long *) origin;
+	pte = origin;
 	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
 	do {
 		next = base_page_addr_end(addr, end);
@@ -393,13 +501,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *ste, next, table;
+	unsigned long *ste, next, *table;
 	int rc;
 
-	ste = (unsigned long *) origin;
+	ste = origin;
 	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 	do {
 		next = base_segment_addr_end(addr, end);
@@ -409,9 +517,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
 			table = base_pgt_alloc();
 			if (!table)
 				return -ENOMEM;
-			*ste = table | _SEGMENT_ENTRY;
+			*ste = __pa(table) | _SEGMENT_ENTRY;
 		}
-		table = *ste & _SEGMENT_ENTRY_ORIGIN;
+		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
 		rc = base_page_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -422,13 +530,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *rtte, next, table;
+	unsigned long *rtte, next, *table;
 	int rc;
 
-	rtte = (unsigned long *) origin;
+	rtte = origin;
 	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
 	do {
 		next = base_region3_addr_end(addr, end);
@@ -438,9 +546,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
 			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
 			if (!table)
 				return -ENOMEM;
-			*rtte = table | _REGION3_ENTRY;
+			*rtte = __pa(table) | _REGION3_ENTRY;
 		}
-		table = *rtte & _REGION_ENTRY_ORIGIN;
+		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
 		rc = base_segment_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
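All of the walker changes above follow one pattern: a table entry stores the physical origin of the next-level table plus type bits, while the walker code manipulates virtual pointers, so entries are written via __pa() and read back via __va(). The sketch below shows just that shape; TOY_PA()/TOY_VA() stand in for the kernel macros (here trivial casts) and the bit values are illustrative, not the s390 ones:

#include <stdint.h>
#include <stdio.h>

#define TOY_PA(v)	((uintptr_t)(v))		/* virt -> phys */
#define TOY_VA(p)	((unsigned long *)(p))		/* phys -> virt */

#define ENTRY_TYPE	0x04UL	/* toy "entry is valid" type bit */
#define ENTRY_INVALID	0x20UL	/* toy invalid bit */
#define ORIGIN_MASK	(~0xfffUL)

/* A page-aligned lower-level table, so ORIGIN_MASK keeps its address. */
static _Alignas(4096) unsigned long next_level[512];

int main(void)
{
	unsigned long ste = ENTRY_INVALID;	/* empty segment entry */
	unsigned long *table;

	if (ste & ENTRY_INVALID) {
		/* Link a lower-level table: store its physical origin. */
		ste = TOY_PA(next_level) | ENTRY_TYPE;
	}
	/* Walk down: convert the stored origin back to a pointer. */
	table = TOY_VA(ste & ORIGIN_MASK);
	printf("linked table at %p\n", (void *)table);
	return 0;
}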
@@ -450,13 +558,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *rste, next, table;
+	unsigned long *rste, next, *table;
 	int rc;
 
-	rste = (unsigned long *) origin;
+	rste = origin;
 	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
 	do {
 		next = base_region2_addr_end(addr, end);
@@ -466,9 +574,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
 			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 			if (!table)
 				return -ENOMEM;
-			*rste = table | _REGION2_ENTRY;
+			*rste = __pa(table) | _REGION2_ENTRY;
 		}
-		table = *rste & _REGION_ENTRY_ORIGIN;
+		table = __va(*rste & _REGION_ENTRY_ORIGIN);
 		rc = base_region3_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -478,13 +586,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *rfte, next, table;
+	unsigned long *rfte, next, *table;
 	int rc;
 
-	rfte = (unsigned long *) origin;
+	rfte = origin;
 	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
 	do {
 		next = base_region1_addr_end(addr, end);
@@ -494,9 +602,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
 			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 			if (!table)
 				return -ENOMEM;
-			*rfte = table | _REGION1_ENTRY;
+			*rfte = __pa(table) | _REGION1_ENTRY;
 		}
-		table = *rfte & _REGION_ENTRY_ORIGIN;
+		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
 		rc = base_region2_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -515,7 +623,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
  */
 void base_asce_free(unsigned long asce)
 {
-	unsigned long table = asce & _ASCE_ORIGIN;
+	unsigned long *table = __va(asce & _ASCE_ORIGIN);
 
 	if (!asce)
 		return;
@@ -567,7 +675,7 @@ static int base_pgt_cache_init(void)
  */
 unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
 {
-	unsigned long asce, table, end;
+	unsigned long asce, *table, end;
 	int rc;
 
 	if (base_pgt_cache_init())
@@ -578,25 +686,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
 		if (!table)
 			return 0;
 		rc = base_segment_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
 	} else if (end <= _REGION2_SIZE) {
 		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 		if (!table)
 			return 0;
 		rc = base_region3_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
 	} else if (end <= _REGION1_SIZE) {
 		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 		if (!table)
 			return 0;
 		rc = base_region2_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
 	} else {
 		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
 		if (!table)
 			return 0;
 		rc = base_region1_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
 	}
 	if (rc) {
 		base_asce_free(asce);
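The visible part of base_asce_alloc() shows its selection logic: the top-level table type is chosen from the highest address to be translated, then __pa(table) is combined with the ASCE type and length bits. The sketch below mirrors only the size-based choice; the shift values match the s390 layout (a segment table covers 2GB, a region-3 table 4TB, a region-2 table 8PB), and everything else is illustrative:

#include <stdio.h>

#define TOY_REGION3_SIZE	(1UL << 31)	/* segment table spans 2GB */
#define TOY_REGION2_SIZE	(1UL << 42)	/* region-3 table spans 4TB */
#define TOY_REGION1_SIZE	(1UL << 53)	/* region-2 table spans 8PB */

static const char *top_level_for(unsigned long addr, unsigned long num_pages)
{
	unsigned long end = addr + num_pages * 4096UL;

	if (end <= TOY_REGION3_SIZE)
		return "segment table (_ASCE_TYPE_SEGMENT)";
	if (end <= TOY_REGION2_SIZE)
		return "region-3 table (_ASCE_TYPE_REGION3)";
	if (end <= TOY_REGION1_SIZE)
		return "region-2 table (_ASCE_TYPE_REGION2)";
	return "region-1 table (_ASCE_TYPE_REGION1)";
}

int main(void)
{
	printf("%s\n", top_level_for(0, 1));		/* one page */
	printf("%s\n", top_level_for(0, 1UL << 32));	/* ~16TB space */
	return 0;
}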