Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  288
1 file changed, 91 insertions, 197 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b45a95363a84..ac65bb5e38ac 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -244,16 +244,66 @@ struct file_region {
 	long to;
 };
 
+/* Must be called with resv->lock held. Calling this with count_only == true
+ * will count the number of pages to be added but will not modify the linked
+ * list.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+				     bool count_only)
+{
+	long chg = 0;
+	struct list_head *head = &resv->regions;
+	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+	/* Locate the region we are before or in. */
+	list_for_each_entry(rg, head, link)
+		if (f <= rg->to)
+			break;
+
+	/* Round our left edge to the current segment if it encloses us. */
+	if (f > rg->from)
+		f = rg->from;
+
+	chg = t - f;
+
+	/* Check for and consume any regions we now overlap with. */
+	nrg = rg;
+	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+		if (&rg->link == head)
+			break;
+		if (rg->from > t)
+			break;
+
+		/* We overlap with this area, if it extends further than
+		 * us then we must extend ourselves.  Account for its
+		 * existing reservation.
+		 */
+		if (rg->to > t) {
+			chg += rg->to - t;
+			t = rg->to;
+		}
+		chg -= rg->to - rg->from;
+
+		if (!count_only && rg != nrg) {
+			list_del(&rg->link);
+			kfree(rg);
+		}
+	}
+
+	if (!count_only) {
+		nrg->from = f;
+		nrg->to = t;
+	}
+
+	return chg;
+}
+
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map.  In the normal case, existing regions will be expanded
- * to accommodate the specified range.  Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range.  However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded.  In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
+ * map.  Existing regions will be expanded to accommodate the specified
+ * range, or a region will be taken from the cache.  Sufficient regions
+ * must exist in the cache due to the previous call to region_chg with
+ * the same range.
  *
  * Return the number of new huge pages added to the map.  This
  * number is greater than or equal to zero.
@@ -261,7 +311,7 @@ struct file_region {
 static long region_add(struct resv_map *resv, long f, long t)
 {
 	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg, *trg;
+	struct file_region *rg, *nrg;
 	long add = 0;
 
 	spin_lock(&resv->lock);
@@ -272,9 +322,8 @@ static long region_add(struct resv_map *resv, long f, long t)
 
 	/*
 	 * If no region exists which can be expanded to include the
-	 * specified range, the list must have been modified by an
-	 * interleving call to region_del().  Pull a region descriptor
-	 * from the cache and use it for this range.
+	 * specified range, pull a region descriptor from the cache
+	 * and use it for this range.
 	 */
 	if (&rg->link == head || t < rg->from) {
 		VM_BUG_ON(resv->region_cache_count <= 0);
@@ -292,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t)
 		goto out_locked;
 	}
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
-
-	/* Check for and consume any regions we now overlap with. */
-	nrg = rg;
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			break;
-
-		/* If this area reaches higher then extend our area to
-		 * include it completely.  If this is not the first area
-		 * which we intend to reuse, free it. */
-		if (rg->to > t)
-			t = rg->to;
-		if (rg != nrg) {
-			/* Decrement return value by the deleted range.
-			 * Another range will span this area so that by
-			 * end of routine add will be >= zero
-			 */
-			add -= (rg->to - rg->from);
-			list_del(&rg->link);
-			kfree(rg);
-		}
-	}
-
-	add += (nrg->from - f);		/* Added to beginning of region */
-	nrg->from = f;
-	add += t - nrg->to;		/* Added to end of region */
-	nrg->to = t;
+	add = add_reservation_in_range(resv, f, t, false);
 
 out_locked:
 	resv->adds_in_progress--;
@@ -339,15 +357,9 @@ out_locked:
  * call to region_add that will actually modify the reserve
  * map to add the specified range [f, t).  region_chg does
  * not change the number of huge pages represented by the
- * map.  However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder.  This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
- *
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map.  If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+ * map.  A new file_region structure is added to the cache
+ * as a placeholder, so that the subsequent region_add
+ * call will have all the regions it needs and will not fail.
  *
  * Returns the number of huge pages that need to be added to the existing
  * reservation map for the range [f, t).  This number is greater or equal to
@@ -356,11 +368,8 @@ out_locked:
  */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
-	struct list_head *head = &resv->regions;
-	struct file_region *rg, *nrg = NULL;
 	long chg = 0;
 
-retry:
 	spin_lock(&resv->lock);
 retry_locked:
 	resv->adds_in_progress++;
@@ -378,10 +387,8 @@ retry_locked:
 		spin_unlock(&resv->lock);
 
 		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
-		if (!trg) {
-			kfree(nrg);
+		if (!trg)
 			return -ENOMEM;
-		}
 
 		spin_lock(&resv->lock);
 		list_add(&trg->link, &resv->region_cache);
@@ -389,61 +396,8 @@ retry_locked:
 		goto retry_locked;
 	}
 
-	/* Locate the region we are before or in. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	chg = add_reservation_in_range(resv, f, t, true);
 
-	/* If we are below the current region then a new region is required.
-	 * Subtle, allocate a new region at the position but make it zero
-	 * size such that we can guarantee to record the reservation. */
-	if (&rg->link == head || t < rg->from) {
-		if (!nrg) {
-			resv->adds_in_progress--;
-			spin_unlock(&resv->lock);
-			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-			if (!nrg)
-				return -ENOMEM;
-
-			nrg->from = f;
-			nrg->to   = f;
-			INIT_LIST_HEAD(&nrg->link);
-			goto retry;
-		}
-
-		list_add(&nrg->link, rg->link.prev);
-		chg = t - f;
-		goto out_nrg;
-	}
-
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
-	chg = t - f;
-
-	/* Check for and consume any regions we now overlap with. */
-	list_for_each_entry(rg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			goto out;
-
-		/* We overlap with this area, if it extends further than
-		 * us then we must extend ourselves.  Account for its
-		 * existing reservation. */
-		if (rg->to > t) {
-			chg += rg->to - t;
-			t = rg->to;
-		}
-		chg -= rg->to - rg->from;
-	}
-
-out:
-	spin_unlock(&resv->lock);
-	/*  We already know we raced and no longer need the new region */
-	kfree(nrg);
-	return chg;
-out_nrg:
 	spin_unlock(&resv->lock);
 	return chg;
 }
@@ -1069,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
-				unsigned long nr_pages, gfp_t gfp_mask)
-{
-	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long i, end_pfn = start_pfn + nr_pages;
-	struct page *page;
-
-	for (i = start_pfn; i < end_pfn; i++) {
-		page = pfn_to_online_page(i);
-		if (!page)
-			return false;
-
-		if (page_zone(page) != z)
-			return false;
-
-		if (PageReserved(page))
-			return false;
-
-		if (page_count(page) > 0)
-			return false;
-
-		if (PageHuge(page))
-			return false;
-	}
-
-	return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long last_pfn = start_pfn + nr_pages - 1;
-	return zone_spans_pfn(zone, last_pfn);
-}
-
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 		int nid, nodemask_t *nodemask)
 {
-	unsigned int order = huge_page_order(h);
-	unsigned long nr_pages = 1 << order;
-	unsigned long ret, pfn, flags;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
+	unsigned long nr_pages = 1UL << huge_page_order(h);
 
-	zonelist = node_zonelist(nid, gfp_mask);
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
-		spin_lock_irqsave(&zone->lock, flags);
-
-		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
-		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
-				/*
-				 * We release the zone lock here because
-				 * alloc_contig_range() will also lock the zone
-				 * at some point. If there's an allocation
-				 * spinning on this lock, it may win the race
-				 * and cause alloc_contig_range() to fail...
-				 */
-				spin_unlock_irqrestore(&zone->lock, flags);
-				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
-				if (!ret)
-					return pfn_to_page(pfn);
-				spin_lock_irqsave(&zone->lock, flags);
-			}
-			pfn += nr_pages;
-		}
-
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-
-	return NULL;
+	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
@@ -3915,7 +3796,7 @@ retry:
 			 * handling userfault.  Reacquire after handling
 			 * fault to make calling code simpler.
 			 */
-			hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+			hash = hugetlb_fault_mutex_hash(mapping, idx);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4042,8 +3923,7 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-			    pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 {
 	unsigned long key[2];
 	u32 hash;
@@ -4051,7 +3931,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 	key[0] = (unsigned long) mapping;
 	key[1] = idx;
 
-	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+	hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
 
 	return hash & (num_fault_mutexes - 1);
 }
@@ -4060,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
  * For uniprocesor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
 */
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-			    pgoff_t idx, unsigned long address)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
 {
 	return 0;
 }
@@ -4105,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 	entry = huge_ptep_get(ptep);
@@ -4459,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				break;
 			}
 		}
+
+		/*
+		 * If subpage information not requested, update counters
+		 * and skip the same_page loop below.
+		 */
+		if (!pages && !vmas && !pfn_offset &&
+		    (vaddr + huge_page_size(h) < vma->vm_end) &&
+		    (remainder >= pages_per_huge_page(h))) {
+			vaddr += huge_page_size(h);
+			remainder -= pages_per_huge_page(h);
+			i += pages_per_huge_page(h);
+			spin_unlock(ptl);
+			continue;
+		}
+
 same_page:
 		if (pages) {
 			pages[i] = mem_map_offset(page, pfn_offset);
@@ -4842,7 +4736,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -4872,7 +4766,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 	return pte;
 }
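Note on the reservation-map refactor above: the overlap walk that region_add() and region_chg() used to duplicate now lives in add_reservation_in_range(), which either only counts the pages a range [f, t) would add (count_only == true) or commits the merge. What follows is a minimal user-space sketch of that count-then-commit pattern, using a sorted array instead of the kernel's linked list of file_region descriptors; the names and the array representation are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Half-open reservation region [from, to), kept sorted and non-overlapping. */
struct region { long from, to; };

static struct region map[64];
static int nregions;

/* Return how many pages [f, t) would add; merge it in unless count_only. */
static long add_in_range(long f, long t, bool count_only)
{
	long add;
	int i = 0, first, last;

	/* Locate the first region we are before or in. */
	while (i < nregions && map[i].to < f)
		i++;
	first = i;

	/* Round the left edge down if that region encloses f. */
	if (first < nregions && map[first].from < f)
		f = map[first].from;
	add = t - f;

	/* Consume every region overlapping [f, t), discounting what it already covers. */
	while (i < nregions && map[i].from <= t) {
		if (map[i].to > t) {
			add += map[i].to - t;
			t = map[i].to;
		}
		add -= map[i].to - map[i].from;
		i++;
	}
	last = i;

	if (count_only)
		return add;

	if (last == first) {
		/* No overlap: open a slot for a brand new region. */
		for (i = nregions; i > first; i--)
			map[i] = map[i - 1];
		nregions++;
	} else {
		/* Collapse the merged regions into the first slot. */
		for (i = last; i < nregions; i++)
			map[first + 1 + i - last] = map[i];
		nregions -= last - first - 1;
	}
	map[first].from = f;
	map[first].to = t;
	return add;
}

int main(void)
{
	/* Mirrors region_chg()/region_add(): count first, then commit. */
	printf("adds %ld pages\n", add_in_range(0, 4, true));   /* 4 */
	add_in_range(0, 4, false);
	printf("adds %ld pages\n", add_in_range(2, 10, true));  /* 6 */
	add_in_range(2, 10, false);
	printf("adds %ld pages\n", add_in_range(3, 8, true));   /* 0 */
	return 0;
}

In the kernel, region_chg() runs the counting pass under resv->lock and also guarantees a spare descriptor in resv->region_cache, so the later region_add() commit never needs to allocate and cannot fail.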
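Note on the hugetlb_fault_mutex_hash() hunks: only the mapping and the page index ever reach the hash key, and the result is masked with (num_fault_mutexes - 1) to pick one mutex from a power-of-two table, which is why the hstate and address parameters could be dropped. Below is a rough user-space sketch of that hash-to-mutex-table idea; the mixing function is a stand-in for the kernel's jhash2(), and the table size and names are invented for illustration.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_FAULT_MUTEXES 64u            /* must stay a power of two */
static pthread_mutex_t fault_mutex_table[NUM_FAULT_MUTEXES];

/* Stand-in mixer; the kernel hashes { mapping, idx } with jhash2(). */
static uint32_t mix(uint64_t mapping, uint64_t idx)
{
	uint64_t h = mapping * 0x9e3779b97f4a7c15ULL ^ idx;

	h ^= h >> 33;
	return (uint32_t)h;
}

/* Same shape as hugetlb_fault_mutex_hash(mapping, idx) after this change. */
static uint32_t fault_mutex_hash(const void *mapping, uint64_t idx)
{
	return mix((uint64_t)(uintptr_t)mapping, idx) & (NUM_FAULT_MUTEXES - 1);
}

int main(void)
{
	int dummy_mapping;               /* stands in for struct address_space */
	uint32_t hash;

	for (unsigned i = 0; i < NUM_FAULT_MUTEXES; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);

	/* Faults on the same (mapping, index) serialize on the same mutex. */
	hash = fault_mutex_hash(&dummy_mapping, 42);
	pthread_mutex_lock(&fault_mutex_table[hash]);
	/* ... instantiate the page, as the fault path would ... */
	pthread_mutex_unlock(&fault_mutex_table[hash]);

	printf("index 42 of this mapping hashes to mutex %u\n", hash);
	return 0;
}

Faults racing on the same (mapping, index) pair serialize on one mutex while unrelated faults spread across the table, which is the behaviour the callers in hugetlb_fault() and the userfault path rely on.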
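Note on the follow_hugetlb_page() hunk: when the caller asked for neither pages nor vmas and the walk sits at the start of a huge page with at least a whole huge page left, the counters can be advanced by a full huge page at once instead of going through the same_page loop per base page. The toy program below models only that bookkeeping, with made-up sizes and variable names.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sizes: a 2 MiB huge page made of 512 4 KiB base pages. */
#define HUGE_PAGE_SIZE      (2UL * 1024 * 1024)
#define PAGES_PER_HUGE_PAGE 512UL

int main(void)
{
	/* Made-up GUP-style request: no page/vma output arrays wanted. */
	bool want_pages = false, want_vmas = false;
	unsigned long vaddr = 0x40000000UL;          /* huge-page aligned */
	unsigned long vm_end = vaddr + 4 * HUGE_PAGE_SIZE;
	unsigned long pfn_offset = 0;                /* start of the huge page */
	unsigned long remainder = 4 * PAGES_PER_HUGE_PAGE;
	unsigned long i = 0;

	while (remainder) {
		if (!want_pages && !want_vmas && !pfn_offset &&
		    vaddr + HUGE_PAGE_SIZE < vm_end &&
		    remainder >= PAGES_PER_HUGE_PAGE) {
			/* Fast path: account a whole huge page in one step. */
			vaddr += HUGE_PAGE_SIZE;
			remainder -= PAGES_PER_HUGE_PAGE;
			i += PAGES_PER_HUGE_PAGE;
			continue;
		}
		/* Slow path: one base page at a time (the same_page loop). */
		vaddr += HUGE_PAGE_SIZE / PAGES_PER_HUGE_PAGE;
		remainder--;
		i++;
	}
	printf("accounted base pages: i = %lu\n", i);
	return 0;
}

The strict comparison against vm_end mirrors the conservative check in the patch, so the last huge page of the range still takes the per-base-page path.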