Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--	arch/x86/kvm/mmu.c	1227
1 file changed, 767 insertions, 460 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aee38623b768..1c5b69373a00 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -22,7 +22,6 @@  #include "mmu.h"  #include "x86.h"  #include "kvm_cache_regs.h" -#include "x86.h"  #include <linux/kvm_host.h>  #include <linux/types.h> @@ -148,7 +147,7 @@ module_param(oos_shadow, bool, 0644);  #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \  			| PT64_NX_MASK) -#define RMAP_EXT 4 +#define PTE_LIST_EXT 4  #define ACC_EXEC_MASK    1  #define ACC_WRITE_MASK   PT_WRITABLE_MASK @@ -164,16 +163,16 @@ module_param(oos_shadow, bool, 0644);  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -struct kvm_rmap_desc { -	u64 *sptes[RMAP_EXT]; -	struct kvm_rmap_desc *more; +struct pte_list_desc { +	u64 *sptes[PTE_LIST_EXT]; +	struct pte_list_desc *more;  };  struct kvm_shadow_walk_iterator {  	u64 addr;  	hpa_t shadow_addr; -	int level;  	u64 *sptep; +	int level;  	unsigned index;  }; @@ -182,32 +181,68 @@ struct kvm_shadow_walk_iterator {  	     shadow_walk_okay(&(_walker));			\  	     shadow_walk_next(&(_walker))) -typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); +#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\ +	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\ +	     shadow_walk_okay(&(_walker)) &&				\ +		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\ +	     __shadow_walk_next(&(_walker), spte)) -static struct kmem_cache *pte_chain_cache; -static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *pte_list_desc_cache;  static struct kmem_cache *mmu_page_header_cache;  static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_trap_nonpresent_pte; -static u64 __read_mostly shadow_notrap_nonpresent_pte;  static u64 __read_mostly shadow_nx_mask;  static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */  static u64 __read_mostly shadow_user_mask;  static u64 __read_mostly shadow_accessed_mask;  static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mmio_mask; -static inline u64 rsvd_bits(int s, int e) +static void mmu_spte_set(u64 *sptep, u64 spte); + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)  { -	return ((1ULL << (e - s + 1)) - 1) << s; +	shadow_mmio_mask = mmio_mask;  } +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) +static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)  { -	shadow_trap_nonpresent_pte = trap_pte; -	shadow_notrap_nonpresent_pte = notrap_pte; +	access &= ACC_WRITE_MASK | ACC_USER_MASK; + +	trace_mark_mmio_spte(sptep, gfn, access); +	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); +} + +static bool is_mmio_spte(u64 spte) +{ +	return (spte & shadow_mmio_mask) == shadow_mmio_mask; +} + +static gfn_t get_mmio_spte_gfn(u64 spte) +{ +	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; +} + +static unsigned get_mmio_spte_access(u64 spte) +{ +	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; +} + +static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +{ +	if (unlikely(is_noslot_pfn(pfn))) { +		mark_mmio_spte(sptep, gfn, access); +		return true; +	} + +	return false; +} + +static inline u64 rsvd_bits(int s, int e) +{ +	return ((1ULL << (e - s + 1)) - 1) << s;  } -EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,  		u64 dirty_mask, u64 nx_mask, u64 x_mask) @@ -220,11 +255,6 @@ void 
kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,  }  EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static bool is_write_protection(struct kvm_vcpu *vcpu) -{ -	return kvm_read_cr0_bits(vcpu, X86_CR0_WP); -} -  static int is_cpuid_PSE36(void)  {  	return 1; @@ -237,8 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)  static int is_shadow_present_pte(u64 pte)  { -	return pte != shadow_trap_nonpresent_pte -		&& pte != shadow_notrap_nonpresent_pte; +	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);  }  static int is_large_pte(u64 pte) @@ -246,11 +275,6 @@ static int is_large_pte(u64 pte)  	return pte & PT_PAGE_SIZE_MASK;  } -static int is_writable_pte(unsigned long pte) -{ -	return pte & PT_WRITABLE_MASK; -} -  static int is_dirty_gpte(unsigned long pte)  {  	return pte & PT_DIRTY_MASK; @@ -282,26 +306,154 @@ static gfn_t pse36_gfn_delta(u32 gpte)  	return (gpte & PT32_DIR_PSE36_MASK) << shift;  } +#ifdef CONFIG_X86_64  static void __set_spte(u64 *sptep, u64 spte)  { -	set_64bit(sptep, spte); +	*sptep = spte;  } -static u64 __xchg_spte(u64 *sptep, u64 new_spte) +static void __update_clear_spte_fast(u64 *sptep, u64 spte)  { -#ifdef CONFIG_X86_64 -	return xchg(sptep, new_spte); +	*sptep = spte; +} + +static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) +{ +	return xchg(sptep, spte); +} + +static u64 __get_spte_lockless(u64 *sptep) +{ +	return ACCESS_ONCE(*sptep); +} + +static bool __check_direct_spte_mmio_pf(u64 spte) +{ +	/* It is valid if the spte is zapped. */ +	return spte == 0ull; +}  #else -	u64 old_spte; +union split_spte { +	struct { +		u32 spte_low; +		u32 spte_high; +	}; +	u64 spte; +}; -	do { -		old_spte = *sptep; -	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); +static void count_spte_clear(u64 *sptep, u64 spte) +{ +	struct kvm_mmu_page *sp =  page_header(__pa(sptep)); -	return old_spte; -#endif +	if (is_shadow_present_pte(spte)) +		return; + +	/* Ensure the spte is completely set before we increase the count */ +	smp_wmb(); +	sp->clear_spte_count++; +} + +static void __set_spte(u64 *sptep, u64 spte) +{ +	union split_spte *ssptep, sspte; + +	ssptep = (union split_spte *)sptep; +	sspte = (union split_spte)spte; + +	ssptep->spte_high = sspte.spte_high; + +	/* +	 * If we map the spte from nonpresent to present, We should store +	 * the high bits firstly, then set present bit, so cpu can not +	 * fetch this spte while we are setting the spte. +	 */ +	smp_wmb(); + +	ssptep->spte_low = sspte.spte_low;  } +static void __update_clear_spte_fast(u64 *sptep, u64 spte) +{ +	union split_spte *ssptep, sspte; + +	ssptep = (union split_spte *)sptep; +	sspte = (union split_spte)spte; + +	ssptep->spte_low = sspte.spte_low; + +	/* +	 * If we map the spte from present to nonpresent, we should clear +	 * present bit firstly to avoid vcpu fetch the old high bits. +	 */ +	smp_wmb(); + +	ssptep->spte_high = sspte.spte_high; +	count_spte_clear(sptep, spte); +} + +static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) +{ +	union split_spte *ssptep, sspte, orig; + +	ssptep = (union split_spte *)sptep; +	sspte = (union split_spte)spte; + +	/* xchg acts as a barrier before the setting of the high bits */ +	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); +	orig.spte_high = ssptep->spte_high = sspte.spte_high; +	count_spte_clear(sptep, spte); + +	return orig.spte; +} + +/* + * The idea using the light way get the spte on x86_32 guest is from + * gup_get_pte(arch/x86/mm/gup.c). 
+ * The difference is we can not catch the spte tlb flush if we leave + * guest mode, so we emulate it by increase clear_spte_count when spte + * is cleared. + */ +static u64 __get_spte_lockless(u64 *sptep) +{ +	struct kvm_mmu_page *sp =  page_header(__pa(sptep)); +	union split_spte spte, *orig = (union split_spte *)sptep; +	int count; + +retry: +	count = sp->clear_spte_count; +	smp_rmb(); + +	spte.spte_low = orig->spte_low; +	smp_rmb(); + +	spte.spte_high = orig->spte_high; +	smp_rmb(); + +	if (unlikely(spte.spte_low != orig->spte_low || +	      count != sp->clear_spte_count)) +		goto retry; + +	return spte.spte; +} + +static bool __check_direct_spte_mmio_pf(u64 spte) +{ +	union split_spte sspte = (union split_spte)spte; +	u32 high_mmio_mask = shadow_mmio_mask >> 32; + +	/* It is valid if the spte is zapped. */ +	if (spte == 0ull) +		return true; + +	/* It is valid if the spte is being zapped. */ +	if (sspte.spte_low == 0ull && +	    (sspte.spte_high & high_mmio_mask) == high_mmio_mask) +		return true; + +	return false; +} +#endif +  static bool spte_has_volatile_bits(u64 spte)  {  	if (!shadow_accessed_mask) @@ -322,12 +474,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)  	return (old_spte & bit_mask) && !(new_spte & bit_mask);  } -static void update_spte(u64 *sptep, u64 new_spte) +/* Rules for using mmu_spte_set: + * Set the sptep from nonpresent to present. + * Note: the sptep being assigned *must* be either not present + * or in a state where the hardware will not attempt to update + * the spte. + */ +static void mmu_spte_set(u64 *sptep, u64 new_spte) +{ +	WARN_ON(is_shadow_present_pte(*sptep)); +	__set_spte(sptep, new_spte); +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changged. + */ +static void mmu_spte_update(u64 *sptep, u64 new_spte)  {  	u64 mask, old_spte = *sptep;  	WARN_ON(!is_rmap_spte(new_spte)); +	if (!is_shadow_present_pte(old_spte)) +		return mmu_spte_set(sptep, new_spte); +  	new_spte |= old_spte & shadow_dirty_mask;  	mask = shadow_accessed_mask; @@ -335,9 +505,9 @@ static void update_spte(u64 *sptep, u64 new_spte)  		mask |= shadow_dirty_mask;  	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) -		__set_spte(sptep, new_spte); +		__update_clear_spte_fast(sptep, new_spte);  	else -		old_spte = __xchg_spte(sptep, new_spte); +		old_spte = __update_clear_spte_slow(sptep, new_spte);  	if (!shadow_accessed_mask)  		return; @@ -348,6 +518,64 @@ static void update_spte(u64 *sptep, u64 new_spte)  		kvm_set_pfn_dirty(spte_to_pfn(old_spte));  } +/* + * Rules for using mmu_spte_clear_track_bits: + * It sets the sptep from present to nonpresent, and track the + * state bits, it is used to clear the last level sptep. + */ +static int mmu_spte_clear_track_bits(u64 *sptep) +{ +	pfn_t pfn; +	u64 old_spte = *sptep; + +	if (!spte_has_volatile_bits(old_spte)) +		__update_clear_spte_fast(sptep, 0ull); +	else +		old_spte = __update_clear_spte_slow(sptep, 0ull); + +	if (!is_rmap_spte(old_spte)) +		return 0; + +	pfn = spte_to_pfn(old_spte); +	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) +		kvm_set_pfn_accessed(pfn); +	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) +		kvm_set_pfn_dirty(pfn); +	return 1; +} + +/* + * Rules for using mmu_spte_clear_no_track: + * Directly clear spte without caring the state bits of sptep, + * it is used to set the upper level spte. 
+ */ +static void mmu_spte_clear_no_track(u64 *sptep) +{ +	__update_clear_spte_fast(sptep, 0ull); +} + +static u64 mmu_spte_get_lockless(u64 *sptep) +{ +	return __get_spte_lockless(sptep); +} + +static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) +{ +	rcu_read_lock(); +	atomic_inc(&vcpu->kvm->arch.reader_counter); + +	/* Increase the counter before walking shadow page table */ +	smp_mb__after_atomic_inc(); +} + +static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) +{ +	/* Decrease the counter after walking shadow page table finished */ +	smp_mb__before_atomic_dec(); +	atomic_dec(&vcpu->kvm->arch.reader_counter); +	rcu_read_unlock(); +} +  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,  				  struct kmem_cache *base_cache, int min)  { @@ -397,12 +625,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)  {  	int r; -	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, -				   pte_chain_cache, 4); -	if (r) -		goto out; -	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, -				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM); +	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, +				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);  	if (r)  		goto out;  	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -416,8 +640,8 @@ out:  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)  { -	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); -	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); +	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, +				pte_list_desc_cache);  	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);  	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,  				mmu_page_header_cache); @@ -433,26 +657,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,  	return p;  } -static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)  { -	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, -				      sizeof(struct kvm_pte_chain)); +	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, +				      sizeof(struct pte_list_desc));  } -static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)  { -	kmem_cache_free(pte_chain_cache, pc); -} - -static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) -{ -	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, -				      sizeof(struct kvm_rmap_desc)); -} - -static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) -{ -	kmem_cache_free(rmap_desc_cache, rd); +	kmem_cache_free(pte_list_desc_cache, pte_list_desc);  }  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) @@ -498,6 +711,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)  		linfo = lpage_info_slot(gfn, slot, i);  		linfo->write_count += 1;  	} +	kvm->arch.indirect_shadow_pages++;  }  static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) @@ -513,6 +727,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)  		linfo->write_count -= 1;  		WARN_ON(linfo->write_count < 0);  	} +	kvm->arch.indirect_shadow_pages--;  }  static int has_wrprotected_page(struct kvm *kvm, @@ -588,67 +803,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)  }  /* - * Take gfn and return the reverse mapping to it. 
- */ - -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) -{ -	struct kvm_memory_slot *slot; -	struct kvm_lpage_info *linfo; - -	slot = gfn_to_memslot(kvm, gfn); -	if (likely(level == PT_PAGE_TABLE_LEVEL)) -		return &slot->rmap[gfn - slot->base_gfn]; - -	linfo = lpage_info_slot(gfn, slot, level); - -	return &linfo->rmap_pde; -} - -/* - * Reverse mapping data structures: + * Pte mapping structures:   * - * If rmapp bit zero is zero, then rmapp point to the shadw page table entry - * that points to page_address(page). + * If pte_list bit zero is zero, then pte_list point to the spte.   * - * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc - * containing more mappings. + * If pte_list bit zero is one, (then pte_list & ~1) points to a struct + * pte_list_desc containing more mappings.   * - * Returns the number of rmap entries before the spte was added or zero if + * Returns the number of pte entries before the spte was added or zero if   * the spte was not added.   *   */ -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, +			unsigned long *pte_list)  { -	struct kvm_mmu_page *sp; -	struct kvm_rmap_desc *desc; -	unsigned long *rmapp; +	struct pte_list_desc *desc;  	int i, count = 0; -	if (!is_rmap_spte(*spte)) -		return count; -	sp = page_header(__pa(spte)); -	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); -	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); -	if (!*rmapp) { -		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); -		*rmapp = (unsigned long)spte; -	} else if (!(*rmapp & 1)) { -		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); -		desc = mmu_alloc_rmap_desc(vcpu); -		desc->sptes[0] = (u64 *)*rmapp; +	if (!*pte_list) { +		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); +		*pte_list = (unsigned long)spte; +	} else if (!(*pte_list & 1)) { +		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); +		desc = mmu_alloc_pte_list_desc(vcpu); +		desc->sptes[0] = (u64 *)*pte_list;  		desc->sptes[1] = spte; -		*rmapp = (unsigned long)desc | 1; +		*pte_list = (unsigned long)desc | 1;  		++count;  	} else { -		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); -		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); -		while (desc->sptes[RMAP_EXT-1] && desc->more) { +		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); +		desc = (struct pte_list_desc *)(*pte_list & ~1ul); +		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {  			desc = desc->more; -			count += RMAP_EXT; +			count += PTE_LIST_EXT;  		} -		if (desc->sptes[RMAP_EXT-1]) { -			desc->more = mmu_alloc_rmap_desc(vcpu); +		if (desc->sptes[PTE_LIST_EXT-1]) { +			desc->more = mmu_alloc_pte_list_desc(vcpu);  			desc = desc->more;  		}  		for (i = 0; desc->sptes[i]; ++i) @@ -658,59 +848,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)  	return count;  } -static void rmap_desc_remove_entry(unsigned long *rmapp, -				   struct kvm_rmap_desc *desc, -				   int i, -				   struct kvm_rmap_desc *prev_desc) +static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) +{ +	struct pte_list_desc *desc; +	u64 *prev_spte; +	int i; + +	if (!*pte_list) +		return NULL; +	else if (!(*pte_list & 1)) { +		if (!spte) +			return (u64 *)*pte_list; +		return NULL; +	} +	desc = (struct pte_list_desc *)(*pte_list & ~1ul); +	prev_spte = NULL; +	while (desc) { +		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { +			if (prev_spte == spte) +				return desc->sptes[i]; +			
prev_spte = desc->sptes[i]; +		} +		desc = desc->more; +	} +	return NULL; +} + +static void +pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, +			   int i, struct pte_list_desc *prev_desc)  {  	int j; -	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) +	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)  		;  	desc->sptes[i] = desc->sptes[j];  	desc->sptes[j] = NULL;  	if (j != 0)  		return;  	if (!prev_desc && !desc->more) -		*rmapp = (unsigned long)desc->sptes[0]; +		*pte_list = (unsigned long)desc->sptes[0];  	else  		if (prev_desc)  			prev_desc->more = desc->more;  		else -			*rmapp = (unsigned long)desc->more | 1; -	mmu_free_rmap_desc(desc); +			*pte_list = (unsigned long)desc->more | 1; +	mmu_free_pte_list_desc(desc);  } -static void rmap_remove(struct kvm *kvm, u64 *spte) +static void pte_list_remove(u64 *spte, unsigned long *pte_list)  { -	struct kvm_rmap_desc *desc; -	struct kvm_rmap_desc *prev_desc; -	struct kvm_mmu_page *sp; -	gfn_t gfn; -	unsigned long *rmapp; +	struct pte_list_desc *desc; +	struct pte_list_desc *prev_desc;  	int i; -	sp = page_header(__pa(spte)); -	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); -	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); -	if (!*rmapp) { -		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); +	if (!*pte_list) { +		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);  		BUG(); -	} else if (!(*rmapp & 1)) { -		rmap_printk("rmap_remove:  %p 1->0\n", spte); -		if ((u64 *)*rmapp != spte) { -			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte); +	} else if (!(*pte_list & 1)) { +		rmap_printk("pte_list_remove:  %p 1->0\n", spte); +		if ((u64 *)*pte_list != spte) { +			printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);  			BUG();  		} -		*rmapp = 0; +		*pte_list = 0;  	} else { -		rmap_printk("rmap_remove:  %p many->many\n", spte); -		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); +		rmap_printk("pte_list_remove:  %p many->many\n", spte); +		desc = (struct pte_list_desc *)(*pte_list & ~1ul);  		prev_desc = NULL;  		while (desc) { -			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) +			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)  				if (desc->sptes[i] == spte) { -					rmap_desc_remove_entry(rmapp, +					pte_list_desc_remove_entry(pte_list,  							       desc, i,  							       prev_desc);  					return; @@ -718,62 +927,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)  			prev_desc = desc;  			desc = desc->more;  		} -		pr_err("rmap_remove: %p many->many\n", spte); +		pr_err("pte_list_remove: %p many->many\n", spte);  		BUG();  	}  } -static int set_spte_track_bits(u64 *sptep, u64 new_spte) +typedef void (*pte_list_walk_fn) (u64 *spte); +static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)  { -	pfn_t pfn; -	u64 old_spte = *sptep; +	struct pte_list_desc *desc; +	int i; -	if (!spte_has_volatile_bits(old_spte)) -		__set_spte(sptep, new_spte); -	else -		old_spte = __xchg_spte(sptep, new_spte); +	if (!*pte_list) +		return; -	if (!is_rmap_spte(old_spte)) -		return 0; +	if (!(*pte_list & 1)) +		return fn((u64 *)*pte_list); -	pfn = spte_to_pfn(old_spte); -	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) -		kvm_set_pfn_accessed(pfn); -	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) -		kvm_set_pfn_dirty(pfn); -	return 1; +	desc = (struct pte_list_desc *)(*pte_list & ~1ul); +	while (desc) { +		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) +			fn(desc->sptes[i]); +		desc = desc->more; +	}  } -static void drop_spte(struct kvm *kvm, u64 
*sptep, u64 new_spte) +/* + * Take gfn and return the reverse mapping to it. + */ +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)  { -	if (set_spte_track_bits(sptep, new_spte)) -		rmap_remove(kvm, sptep); +	struct kvm_memory_slot *slot; +	struct kvm_lpage_info *linfo; + +	slot = gfn_to_memslot(kvm, gfn); +	if (likely(level == PT_PAGE_TABLE_LEVEL)) +		return &slot->rmap[gfn - slot->base_gfn]; + +	linfo = lpage_info_slot(gfn, slot, level); + +	return &linfo->rmap_pde; +} + +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ +	struct kvm_mmu_page *sp; +	unsigned long *rmapp; + +	sp = page_header(__pa(spte)); +	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); +	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); +	return pte_list_add(vcpu, spte, rmapp);  }  static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)  { -	struct kvm_rmap_desc *desc; -	u64 *prev_spte; -	int i; +	return pte_list_next(rmapp, spte); +} -	if (!*rmapp) -		return NULL; -	else if (!(*rmapp & 1)) { -		if (!spte) -			return (u64 *)*rmapp; -		return NULL; -	} -	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); -	prev_spte = NULL; -	while (desc) { -		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { -			if (prev_spte == spte) -				return desc->sptes[i]; -			prev_spte = desc->sptes[i]; -		} -		desc = desc->more; -	} -	return NULL; +static void rmap_remove(struct kvm *kvm, u64 *spte) +{ +	struct kvm_mmu_page *sp; +	gfn_t gfn; +	unsigned long *rmapp; + +	sp = page_header(__pa(spte)); +	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); +	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); +	pte_list_remove(spte, rmapp); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep) +{ +	if (mmu_spte_clear_track_bits(sptep)) +		rmap_remove(kvm, sptep);  }  static int rmap_write_protect(struct kvm *kvm, u64 gfn) @@ -790,7 +1017,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  		BUG_ON(!(*spte & PT_PRESENT_MASK));  		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);  		if (is_writable_pte(*spte)) { -			update_spte(spte, *spte & ~PT_WRITABLE_MASK); +			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);  			write_protected = 1;  		}  		spte = rmap_next(kvm, rmapp, spte); @@ -807,8 +1034,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));  			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);  			if (is_writable_pte(*spte)) { -				drop_spte(kvm, spte, -					  shadow_trap_nonpresent_pte); +				drop_spte(kvm, spte);  				--kvm->stat.lpages;  				spte = NULL;  				write_protected = 1; @@ -829,7 +1055,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,  	while ((spte = rmap_next(kvm, rmapp, NULL))) {  		BUG_ON(!(*spte & PT_PRESENT_MASK));  		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); -		drop_spte(kvm, spte, shadow_trap_nonpresent_pte); +		drop_spte(kvm, spte);  		need_tlb_flush = 1;  	}  	return need_tlb_flush; @@ -851,7 +1077,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,  		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);  		need_flush = 1;  		if (pte_write(*ptep)) { -			drop_spte(kvm, spte, shadow_trap_nonpresent_pte); +			drop_spte(kvm, spte);  			spte = rmap_next(kvm, rmapp, NULL);  		} else {  			new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -860,7 +1086,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,  			new_spte &= 
~PT_WRITABLE_MASK;  			new_spte &= ~SPTE_HOST_WRITEABLE;  			new_spte &= ~shadow_accessed_mask; -			set_spte_track_bits(spte, new_spte); +			mmu_spte_clear_track_bits(spte); +			mmu_spte_set(spte, new_spte);  			spte = rmap_next(kvm, rmapp, spte);  		}  	} @@ -1032,151 +1259,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)  	percpu_counter_add(&kvm_total_used_mmu_pages, nr);  } -static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) +/* + * Remove the sp from shadow page cache, after call it, + * we can not find this sp from the cache, and the shadow + * page table is still valid. + * It should be under the protection of mmu lock. + */ +static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)  {  	ASSERT(is_empty_shadow_page(sp->spt));  	hlist_del(&sp->hash_link); -	list_del(&sp->link); -	free_page((unsigned long)sp->spt);  	if (!sp->role.direct)  		free_page((unsigned long)sp->gfns); -	kmem_cache_free(mmu_page_header_cache, sp); -	kvm_mod_used_mmu_pages(kvm, -1);  } -static unsigned kvm_page_table_hashfn(gfn_t gfn) +/* + * Free the shadow page table and the sp, we can do it + * out of the protection of mmu lock. + */ +static void kvm_mmu_free_page(struct kvm_mmu_page *sp)  { -	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); +	list_del(&sp->link); +	free_page((unsigned long)sp->spt); +	kmem_cache_free(mmu_page_header_cache, sp);  } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, -					       u64 *parent_pte, int direct) +static unsigned kvm_page_table_hashfn(gfn_t gfn)  { -	struct kvm_mmu_page *sp; - -	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); -	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); -	if (!direct) -		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, -						  PAGE_SIZE); -	set_page_private(virt_to_page(sp->spt), (unsigned long)sp); -	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); -	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); -	sp->multimapped = 0; -	sp->parent_pte = parent_pte; -	kvm_mod_used_mmu_pages(vcpu->kvm, +1); -	return sp; +	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);  }  static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,  				    struct kvm_mmu_page *sp, u64 *parent_pte)  { -	struct kvm_pte_chain *pte_chain; -	struct hlist_node *node; -	int i; -  	if (!parent_pte)  		return; -	if (!sp->multimapped) { -		u64 *old = sp->parent_pte; -		if (!old) { -			sp->parent_pte = parent_pte; -			return; -		} -		sp->multimapped = 1; -		pte_chain = mmu_alloc_pte_chain(vcpu); -		INIT_HLIST_HEAD(&sp->parent_ptes); -		hlist_add_head(&pte_chain->link, &sp->parent_ptes); -		pte_chain->parent_ptes[0] = old; -	} -	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { -		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) -			continue; -		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) -			if (!pte_chain->parent_ptes[i]) { -				pte_chain->parent_ptes[i] = parent_pte; -				return; -			} -	} -	pte_chain = mmu_alloc_pte_chain(vcpu); -	BUG_ON(!pte_chain); -	hlist_add_head(&pte_chain->link, &sp->parent_ptes); -	pte_chain->parent_ptes[0] = parent_pte; +	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);  }  static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,  				       u64 *parent_pte)  { -	struct kvm_pte_chain *pte_chain; -	struct hlist_node *node; -	int i; - -	if (!sp->multimapped) { -		BUG_ON(sp->parent_pte != parent_pte); -		sp->parent_pte = NULL; -		return; -	} -	hlist_for_each_entry(pte_chain, node, 
&sp->parent_ptes, link) -		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { -			if (!pte_chain->parent_ptes[i]) -				break; -			if (pte_chain->parent_ptes[i] != parent_pte) -				continue; -			while (i + 1 < NR_PTE_CHAIN_ENTRIES -				&& pte_chain->parent_ptes[i + 1]) { -				pte_chain->parent_ptes[i] -					= pte_chain->parent_ptes[i + 1]; -				++i; -			} -			pte_chain->parent_ptes[i] = NULL; -			if (i == 0) { -				hlist_del(&pte_chain->link); -				mmu_free_pte_chain(pte_chain); -				if (hlist_empty(&sp->parent_ptes)) { -					sp->multimapped = 0; -					sp->parent_pte = NULL; -				} -			} -			return; -		} -	BUG(); +	pte_list_remove(parent_pte, &sp->parent_ptes);  } -static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) +static void drop_parent_pte(struct kvm_mmu_page *sp, +			    u64 *parent_pte)  { -	struct kvm_pte_chain *pte_chain; -	struct hlist_node *node; -	struct kvm_mmu_page *parent_sp; -	int i; - -	if (!sp->multimapped && sp->parent_pte) { -		parent_sp = page_header(__pa(sp->parent_pte)); -		fn(parent_sp, sp->parent_pte); -		return; -	} - -	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) -		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { -			u64 *spte = pte_chain->parent_ptes[i]; +	mmu_page_remove_parent_pte(sp, parent_pte); +	mmu_spte_clear_no_track(parent_pte); +} -			if (!spte) -				break; -			parent_sp = page_header(__pa(spte)); -			fn(parent_sp, spte); -		} +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, +					       u64 *parent_pte, int direct) +{ +	struct kvm_mmu_page *sp; +	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, +					sizeof *sp); +	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); +	if (!direct) +		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, +						  PAGE_SIZE); +	set_page_private(virt_to_page(sp->spt), (unsigned long)sp); +	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); +	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); +	sp->parent_ptes = 0; +	mmu_page_add_parent_pte(vcpu, sp, parent_pte); +	kvm_mod_used_mmu_pages(vcpu->kvm, +1); +	return sp;  } -static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void mark_unsync(u64 *spte);  static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)  { -	mmu_parent_walk(sp, mark_unsync); +	pte_list_walk(&sp->parent_ptes, mark_unsync);  } -static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) +static void mark_unsync(u64 *spte)  { +	struct kvm_mmu_page *sp;  	unsigned int index; +	sp = page_header(__pa(spte));  	index = spte - sp->spt;  	if (__test_and_set_bit(index, sp->unsync_child_bitmap))  		return; @@ -1185,15 +1350,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)  	kvm_mmu_mark_parents_unsync(sp);  } -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, -				    struct kvm_mmu_page *sp) -{ -	int i; - -	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) -		sp->spt[i] = shadow_trap_nonpresent_pte; -} -  static int nonpaging_sync_page(struct kvm_vcpu *vcpu,  			       struct kvm_mmu_page *sp)  { @@ -1475,6 +1631,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,  	}  } +static void init_shadow_page_table(struct kvm_mmu_page *sp) +{ +	int i; + +	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) +		sp->spt[i] = 0ull; +} +  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  					     gfn_t gfn,  					     gva_t gaddr, @@ -1537,10 +1701,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  		account_shadowed(vcpu->kvm, 
gfn);  	} -	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) -		vcpu->arch.mmu.prefetch_page(vcpu, sp); -	else -		nonpaging_prefetch_page(vcpu, sp); +	init_shadow_page_table(sp);  	trace_kvm_mmu_get_page(sp, true);  	return sp;  } @@ -1572,21 +1733,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)  	if (iterator->level < PT_PAGE_TABLE_LEVEL)  		return false; -	if (iterator->level == PT_PAGE_TABLE_LEVEL) -		if (is_large_pte(*iterator->sptep)) -			return false; -  	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);  	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;  	return true;  } -static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, +			       u64 spte)  { -	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; +	if (is_last_spte(spte, iterator->level)) { +		iterator->level = 0; +		return; +	} + +	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;  	--iterator->level;  } +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ +	return __shadow_walk_next(iterator, *iterator->sptep); +} +  static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)  {  	u64 spte; @@ -1594,13 +1762,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)  	spte = __pa(sp->spt)  		| PT_PRESENT_MASK | PT_ACCESSED_MASK  		| PT_WRITABLE_MASK | PT_USER_MASK; -	__set_spte(sptep, spte); +	mmu_spte_set(sptep, spte);  }  static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)  {  	if (is_large_pte(*sptep)) { -		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); +		drop_spte(vcpu->kvm, sptep);  		kvm_flush_remote_tlbs(vcpu->kvm);  	}  } @@ -1622,38 +1790,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		if (child->role.access == direct_access)  			return; -		mmu_page_remove_parent_pte(child, sptep); -		__set_spte(sptep, shadow_trap_nonpresent_pte); +		drop_parent_pte(child, sptep);  		kvm_flush_remote_tlbs(vcpu->kvm);  	}  } +static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, +			     u64 *spte) +{ +	u64 pte; +	struct kvm_mmu_page *child; + +	pte = *spte; +	if (is_shadow_present_pte(pte)) { +		if (is_last_spte(pte, sp->role.level)) +			drop_spte(kvm, spte); +		else { +			child = page_header(pte & PT64_BASE_ADDR_MASK); +			drop_parent_pte(child, spte); +		} +	} else if (is_mmio_spte(pte)) +		mmu_spte_clear_no_track(spte); + +	if (is_large_pte(pte)) +		--kvm->stat.lpages; +} +  static void kvm_mmu_page_unlink_children(struct kvm *kvm,  					 struct kvm_mmu_page *sp)  {  	unsigned i; -	u64 *pt; -	u64 ent; - -	pt = sp->spt; - -	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { -		ent = pt[i]; - -		if (is_shadow_present_pte(ent)) { -			if (!is_last_spte(ent, sp->role.level)) { -				ent &= PT64_BASE_ADDR_MASK; -				mmu_page_remove_parent_pte(page_header(ent), -							   &pt[i]); -			} else { -				if (is_large_pte(ent)) -					--kvm->stat.lpages; -				drop_spte(kvm, &pt[i], -					  shadow_trap_nonpresent_pte); -			} -		} -		pt[i] = shadow_trap_nonpresent_pte; -	} + +	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) +		mmu_page_zap_pte(kvm, sp, sp->spt + i);  }  static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -1674,20 +1843,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)  {  	u64 *parent_pte; -	while (sp->multimapped || sp->parent_pte) { -		if (!sp->multimapped) -			parent_pte = sp->parent_pte; -		
else { -			struct kvm_pte_chain *chain; - -			chain = container_of(sp->parent_ptes.first, -					     struct kvm_pte_chain, link); -			parent_pte = chain->parent_ptes[0]; -		} -		BUG_ON(!parent_pte); -		kvm_mmu_put_page(sp, parent_pte); -		__set_spte(parent_pte, shadow_trap_nonpresent_pte); -	} +	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) +		drop_parent_pte(sp, parent_pte);  }  static int mmu_zap_unsync_children(struct kvm *kvm, @@ -1734,6 +1891,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,  		/* Count self */  		ret++;  		list_move(&sp->link, invalid_list); +		kvm_mod_used_mmu_pages(kvm, -1);  	} else {  		list_move(&sp->link, &kvm->arch.active_mmu_pages);  		kvm_reload_remote_mmus(kvm); @@ -1744,6 +1902,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,  	return ret;  } +static void kvm_mmu_isolate_pages(struct list_head *invalid_list) +{ +	struct kvm_mmu_page *sp; + +	list_for_each_entry(sp, invalid_list, link) +		kvm_mmu_isolate_page(sp); +} + +static void free_pages_rcu(struct rcu_head *head) +{ +	struct kvm_mmu_page *next, *sp; + +	sp = container_of(head, struct kvm_mmu_page, rcu); +	while (sp) { +		if (!list_empty(&sp->link)) +			next = list_first_entry(&sp->link, +				      struct kvm_mmu_page, link); +		else +			next = NULL; +		kvm_mmu_free_page(sp); +		sp = next; +	} +} +  static void kvm_mmu_commit_zap_page(struct kvm *kvm,  				    struct list_head *invalid_list)  { @@ -1754,10 +1936,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,  	kvm_flush_remote_tlbs(kvm); +	if (atomic_read(&kvm->arch.reader_counter)) { +		kvm_mmu_isolate_pages(invalid_list); +		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); +		list_del_init(invalid_list); + +		trace_kvm_mmu_delay_free_pages(sp); +		call_rcu(&sp->rcu, free_pages_rcu); +		return; +	} +  	do {  		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);  		WARN_ON(!sp->role.invalid || sp->root_count); -		kvm_mmu_free_page(kvm, sp); +		kvm_mmu_isolate_page(sp); +		kvm_mmu_free_page(sp);  	} while (!list_empty(invalid_list));  } @@ -1783,8 +1976,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)  			page = container_of(kvm->arch.active_mmu_pages.prev,  					    struct kvm_mmu_page, link);  			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); -			kvm_mmu_commit_zap_page(kvm, &invalid_list);  		} +		kvm_mmu_commit_zap_page(kvm, &invalid_list);  		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;  	} @@ -1833,20 +2026,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)  	__set_bit(slot, sp->slot_bitmap);  } -static void mmu_convert_notrap(struct kvm_mmu_page *sp) -{ -	int i; -	u64 *pt = sp->spt; - -	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) -		return; - -	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { -		if (pt[i] == shadow_notrap_nonpresent_pte) -			__set_spte(&pt[i], shadow_trap_nonpresent_pte); -	} -} -  /*   * The function is based on mtrr_type_lookup() in   * arch/x86/kernel/cpu/mtrr/generic.c @@ -1959,7 +2138,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  	sp->unsync = 1;  	kvm_mmu_mark_parents_unsync(sp); -	mmu_convert_notrap(sp);  }  static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn) @@ -2002,13 +2180,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,  static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		    unsigned pte_access, int user_fault, -		    int write_fault, 
int dirty, int level, +		    int write_fault, int level,  		    gfn_t gfn, pfn_t pfn, bool speculative,  		    bool can_unsync, bool host_writable)  {  	u64 spte, entry = *sptep;  	int ret = 0; +	if (set_mmio_spte(sptep, gfn, pfn, pte_access)) +		return 0; +  	/*  	 * We don't set the accessed bit, since we sometimes want to see  	 * whether the guest actually used the pte (in order to detect @@ -2017,8 +2198,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  	spte = PT_PRESENT_MASK;  	if (!speculative)  		spte |= shadow_accessed_mask; -	if (!dirty) -		pte_access &= ~ACC_WRITE_MASK; +  	if (pte_access & ACC_EXEC_MASK)  		spte |= shadow_x_mask;  	else @@ -2045,15 +2225,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		if (level > PT_PAGE_TABLE_LEVEL &&  		    has_wrprotected_page(vcpu->kvm, gfn, level)) {  			ret = 1; -			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); +			drop_spte(vcpu->kvm, sptep);  			goto done;  		}  		spte |= PT_WRITABLE_MASK;  		if (!vcpu->arch.mmu.direct_map -		    && !(pte_access & ACC_WRITE_MASK)) +		    && !(pte_access & ACC_WRITE_MASK)) {  			spte &= ~PT_USER_MASK; +			/* +			 * If we converted a user page to a kernel page, +			 * so that the kernel can write to it when cr0.wp=0, +			 * then we should prevent the kernel from executing it +			 * if SMEP is enabled. +			 */ +			if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) +				spte |= PT64_NX_MASK; +		}  		/*  		 * Optimization: for pte sync, if spte was writable the hash @@ -2078,7 +2267,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		mark_page_dirty(vcpu->kvm, gfn);  set_pte: -	update_spte(sptep, spte); +	mmu_spte_update(sptep, spte);  	/*  	 * If we overwrite a writable spte with a read-only one we  	 * should flush remote TLBs. Otherwise rmap_write_protect @@ -2093,8 +2282,8 @@ done:  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  			 unsigned pt_access, unsigned pte_access, -			 int user_fault, int write_fault, int dirty, -			 int *ptwrite, int level, gfn_t gfn, +			 int user_fault, int write_fault, +			 int *emulate, int level, gfn_t gfn,  			 pfn_t pfn, bool speculative,  			 bool host_writable)  { @@ -2117,26 +2306,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  			u64 pte = *sptep;  			child = page_header(pte & PT64_BASE_ADDR_MASK); -			mmu_page_remove_parent_pte(child, sptep); -			__set_spte(sptep, shadow_trap_nonpresent_pte); +			drop_parent_pte(child, sptep);  			kvm_flush_remote_tlbs(vcpu->kvm);  		} else if (pfn != spte_to_pfn(*sptep)) {  			pgprintk("hfn old %llx new %llx\n",  				 spte_to_pfn(*sptep), pfn); -			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); +			drop_spte(vcpu->kvm, sptep);  			kvm_flush_remote_tlbs(vcpu->kvm);  		} else  			was_rmapped = 1;  	}  	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, -		      dirty, level, gfn, pfn, speculative, true, +		      level, gfn, pfn, speculative, true,  		      host_writable)) {  		if (write_fault) -			*ptwrite = 1; +			*emulate = 1;  		kvm_mmu_flush_tlb(vcpu);  	} +	if (unlikely(is_mmio_spte(*sptep) && emulate)) +		*emulate = 1; +  	pgprintk("%s: setting spte %llx\n", __func__, *sptep);  	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",  		 is_large_pte(*sptep)? 
"2MB" : "4kB", @@ -2145,11 +2336,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  	if (!was_rmapped && is_large_pte(*sptep))  		++vcpu->kvm->stat.lpages; -	page_header_update_slot(vcpu->kvm, sptep, gfn); -	if (!was_rmapped) { -		rmap_count = rmap_add(vcpu, sptep, gfn); -		if (rmap_count > RMAP_RECYCLE_THRESHOLD) -			rmap_recycle(vcpu, sptep, gfn); +	if (is_shadow_present_pte(*sptep)) { +		page_header_update_slot(vcpu->kvm, sptep, gfn); +		if (!was_rmapped) { +			rmap_count = rmap_add(vcpu, sptep, gfn); +			if (rmap_count > RMAP_RECYCLE_THRESHOLD) +				rmap_recycle(vcpu, sptep, gfn); +		}  	}  	kvm_release_pfn_clean(pfn);  	if (speculative) { @@ -2170,8 +2363,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,  	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);  	if (!slot) { -		get_page(bad_page); -		return page_to_pfn(bad_page); +		get_page(fault_page); +		return page_to_pfn(fault_page);  	}  	hva = gfn_to_hva_memslot(slot, gfn); @@ -2198,7 +2391,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,  	for (i = 0; i < ret; i++, gfn++, start++)  		mmu_set_spte(vcpu, start, ACC_ALL, -			     access, 0, 0, 1, NULL, +			     access, 0, 0, NULL,  			     sp->role.level, gfn,  			     page_to_pfn(pages[i]), true, true); @@ -2217,7 +2410,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,  	spte = sp->spt + i;  	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { -		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { +		if (is_shadow_present_pte(*spte) || spte == sptep) {  			if (!start)  				continue;  			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) @@ -2254,7 +2447,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,  {  	struct kvm_shadow_walk_iterator iterator;  	struct kvm_mmu_page *sp; -	int pt_write = 0; +	int emulate = 0;  	gfn_t pseudo_gfn;  	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { @@ -2262,14 +2455,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,  			unsigned pte_access = ACC_ALL;  			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, -				     0, write, 1, &pt_write, +				     0, write, &emulate,  				     level, gfn, pfn, prefault, map_writable);  			direct_pte_prefetch(vcpu, iterator.sptep);  			++vcpu->stat.pf_fixed;  			break;  		} -		if (*iterator.sptep == shadow_trap_nonpresent_pte) { +		if (!is_shadow_present_pte(*iterator.sptep)) {  			u64 base_addr = iterator.addr;  			base_addr &= PT64_LVL_ADDR_MASK(iterator.level); @@ -2283,14 +2476,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,  				return -ENOMEM;  			} -			__set_spte(iterator.sptep, -				   __pa(sp->spt) -				   | PT_PRESENT_MASK | PT_WRITABLE_MASK -				   | shadow_user_mask | shadow_x_mask -				   | shadow_accessed_mask); +			mmu_spte_set(iterator.sptep, +				     __pa(sp->spt) +				     | PT_PRESENT_MASK | PT_WRITABLE_MASK +				     | shadow_user_mask | shadow_x_mask +				     | shadow_accessed_mask);  		}  	} -	return pt_write; +	return emulate;  }  static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) @@ -2306,16 +2499,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *  	send_sig_info(SIGBUS, &info, tsk);  } -static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)  {  	kvm_release_pfn_clean(pfn);  	if (is_hwpoison_pfn(pfn)) { -		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), 
current); +		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);  		return 0; -	} else if (is_fault_pfn(pfn)) -		return -EFAULT; +	} -	return 1; +	return -EFAULT;  }  static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, @@ -2360,6 +2552,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,  	}  } +static bool mmu_invalid_pfn(pfn_t pfn) +{ +	return unlikely(is_invalid_pfn(pfn)); +} + +static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, +				pfn_t pfn, unsigned access, int *ret_val) +{ +	bool ret = true; + +	/* The pfn is invalid, report the error! */ +	if (unlikely(is_invalid_pfn(pfn))) { +		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); +		goto exit; +	} + +	if (unlikely(is_noslot_pfn(pfn))) +		vcpu_cache_mmio_info(vcpu, gva, gfn, access); + +	ret = false; +exit: +	return ret; +} +  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,  			 gva_t gva, pfn_t *pfn, bool write, bool *writable); @@ -2394,9 +2610,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,  	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))  		return 0; -	/* mmio */ -	if (is_error_pfn(pfn)) -		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); +	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) +		return r;  	spin_lock(&vcpu->kvm->mmu_lock);  	if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2623,6 +2838,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))  		return; +	vcpu_clear_mmio_info(vcpu, ~0ul);  	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);  	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {  		hpa_t root = vcpu->arch.mmu.root_hpa; @@ -2667,6 +2883,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,  	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);  } +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ +	if (direct) +		return vcpu_match_mmio_gpa(vcpu, addr); + +	return vcpu_match_mmio_gva(vcpu, addr); +} + + +/* + * On direct hosts, the last spte is only allows two states + * for mmio page fault: + *   - It is the mmio spte + *   - It is zapped or it is being zapped. + * + * This function completely checks the spte when the last spte + * is not the mmio spte. + */ +static bool check_direct_spte_mmio_pf(u64 spte) +{ +	return __check_direct_spte_mmio_pf(spte); +} + +static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +{ +	struct kvm_shadow_walk_iterator iterator; +	u64 spte = 0ull; + +	walk_shadow_page_lockless_begin(vcpu); +	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) +		if (!is_shadow_present_pte(spte)) +			break; +	walk_shadow_page_lockless_end(vcpu); + +	return spte; +} + +/* + * If it is a real mmio page fault, return 1 and emulat the instruction + * directly, return 0 to let CPU fault again on the address, -1 is + * returned if bug is detected. 
+ */ +int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +{ +	u64 spte; + +	if (quickly_check_mmio_pf(vcpu, addr, direct)) +		return 1; + +	spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + +	if (is_mmio_spte(spte)) { +		gfn_t gfn = get_mmio_spte_gfn(spte); +		unsigned access = get_mmio_spte_access(spte); + +		if (direct) +			addr = 0; + +		trace_handle_mmio_page_fault(addr, gfn, access); +		vcpu_cache_mmio_info(vcpu, addr, gfn, access); +		return 1; +	} + +	/* +	 * It's ok if the gva is remapped by other cpus on shadow guest, +	 * it's a BUG if the gfn is not a mmio page. +	 */ +	if (direct && !check_direct_spte_mmio_pf(spte)) +		return -1; + +	/* +	 * If the page table is zapped by other cpus, let CPU fault again on +	 * the address. +	 */ +	return 0; +} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); + +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, +				  u32 error_code, bool direct) +{ +	int ret; + +	ret = handle_mmio_page_fault_common(vcpu, addr, direct); +	WARN_ON(ret < 0); +	return ret; +} +  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,  				u32 error_code, bool prefault)  { @@ -2674,6 +2978,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,  	int r;  	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + +	if (unlikely(error_code & PFERR_RSVD_MASK)) +		return handle_mmio_page_fault(vcpu, gva, error_code, true); +  	r = mmu_topup_memory_caches(vcpu);  	if (r)  		return r; @@ -2750,6 +3058,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,  	ASSERT(vcpu);  	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); +	if (unlikely(error_code & PFERR_RSVD_MASK)) +		return handle_mmio_page_fault(vcpu, gpa, error_code, true); +  	r = mmu_topup_memory_caches(vcpu);  	if (r)  		return r; @@ -2767,9 +3078,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,  	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))  		return 0; -	/* mmio */ -	if (is_error_pfn(pfn)) -		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); +	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) +		return r; +  	spin_lock(&vcpu->kvm->mmu_lock);  	if (mmu_notifier_retry(vcpu, mmu_seq))  		goto out_unlock; @@ -2800,7 +3111,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,  	context->page_fault = nonpaging_page_fault;  	context->gva_to_gpa = nonpaging_gva_to_gpa;  	context->free = nonpaging_free; -	context->prefetch_page = nonpaging_prefetch_page;  	context->sync_page = nonpaging_sync_page;  	context->invlpg = nonpaging_invlpg;  	context->update_pte = nonpaging_update_pte; @@ -2848,6 +3158,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)  	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;  } +static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, +			   int *nr_present) +{ +	if (unlikely(is_mmio_spte(*sptep))) { +		if (gfn != get_mmio_spte_gfn(*sptep)) { +			mmu_spte_clear_no_track(sptep); +			return true; +		} + +		(*nr_present)++; +		mark_mmio_spte(sptep, gfn, access); +		return true; +	} + +	return false; +} +  #define PTTYPE 64  #include "paging_tmpl.h"  #undef PTTYPE @@ -2930,7 +3257,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,  	context->new_cr3 = paging_new_cr3;  	context->page_fault = paging64_page_fault;  	context->gva_to_gpa = paging64_gva_to_gpa; -	context->prefetch_page = paging64_prefetch_page;  	context->sync_page = paging64_sync_page;  	context->invlpg = 
paging64_invlpg;  	context->update_pte = paging64_update_pte; @@ -2959,7 +3285,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,  	context->page_fault = paging32_page_fault;  	context->gva_to_gpa = paging32_gva_to_gpa;  	context->free = paging_free; -	context->prefetch_page = paging32_prefetch_page;  	context->sync_page = paging32_sync_page;  	context->invlpg = paging32_invlpg;  	context->update_pte = paging32_update_pte; @@ -2984,7 +3309,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)  	context->new_cr3 = nonpaging_new_cr3;  	context->page_fault = tdp_page_fault;  	context->free = nonpaging_free; -	context->prefetch_page = nonpaging_prefetch_page;  	context->sync_page = nonpaging_sync_page;  	context->invlpg = nonpaging_invlpg;  	context->update_pte = nonpaging_update_pte; @@ -3023,6 +3347,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)  int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)  {  	int r; +	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);  	ASSERT(vcpu);  	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -3037,6 +3362,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)  	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);  	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu); +	vcpu->arch.mmu.base_role.smep_andnot_wp +		= smep && !is_write_protection(vcpu);  	return r;  } @@ -3141,27 +3468,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)  }  EXPORT_SYMBOL_GPL(kvm_mmu_unload); -static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, -				  struct kvm_mmu_page *sp, -				  u64 *spte) -{ -	u64 pte; -	struct kvm_mmu_page *child; - -	pte = *spte; -	if (is_shadow_present_pte(pte)) { -		if (is_last_spte(pte, sp->role.level)) -			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); -		else { -			child = page_header(pte & PT64_BASE_ADDR_MASK); -			mmu_page_remove_parent_pte(child, spte); -		} -	} -	__set_spte(spte, shadow_trap_nonpresent_pte); -	if (is_large_pte(pte)) -		--vcpu->kvm->stat.lpages; -} -  static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,  				  struct kvm_mmu_page *sp, u64 *spte,  				  const void *new) @@ -3233,6 +3539,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,  	int level, npte, invlpg_counter, r, flooded = 0;  	bool remote_flush, local_flush, zap_page; +	/* +	 * If we don't have indirect shadow pages, it means no page is +	 * write-protected, so we can exit simply. 
+	 */ +	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) +		return; +  	zap_page = remote_flush = local_flush = false;  	offset = offset_in_page(gpa); @@ -3336,7 +3649,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,  		spte = &sp->spt[page_offset / sizeof(*spte)];  		while (npte--) {  			entry = *spte; -			mmu_pte_write_zap_pte(vcpu, sp, spte); +			mmu_page_zap_pte(vcpu->kvm, sp, spte);  			if (gentry &&  			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)  			      & mask.word)) @@ -3380,9 +3693,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)  		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,  				  struct kvm_mmu_page, link);  		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); -		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  		++vcpu->kvm->stat.mmu_recycled;  	} +	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  }  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, @@ -3506,15 +3819,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)  				continue;  			if (is_large_pte(pt[i])) { -				drop_spte(kvm, &pt[i], -					  shadow_trap_nonpresent_pte); +				drop_spte(kvm, &pt[i]);  				--kvm->stat.lpages;  				continue;  			}  			/* avoid RMW */  			if (is_writable_pte(pt[i])) -				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); +				mmu_spte_update(&pt[i], +						pt[i] & ~PT_WRITABLE_MASK);  		}  	}  	kvm_flush_remote_tlbs(kvm); @@ -3590,25 +3903,18 @@ static struct shrinker mmu_shrinker = {  static void mmu_destroy_caches(void)  { -	if (pte_chain_cache) -		kmem_cache_destroy(pte_chain_cache); -	if (rmap_desc_cache) -		kmem_cache_destroy(rmap_desc_cache); +	if (pte_list_desc_cache) +		kmem_cache_destroy(pte_list_desc_cache);  	if (mmu_page_header_cache)  		kmem_cache_destroy(mmu_page_header_cache);  }  int kvm_mmu_module_init(void)  { -	pte_chain_cache = kmem_cache_create("kvm_pte_chain", -					    sizeof(struct kvm_pte_chain), -					    0, 0, NULL); -	if (!pte_chain_cache) -		goto nomem; -	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", -					    sizeof(struct kvm_rmap_desc), +	pte_list_desc_cache = kmem_cache_create("pte_list_desc", +					    sizeof(struct pte_list_desc),  					    0, 0, NULL); -	if (!rmap_desc_cache) +	if (!pte_list_desc_cache)  		goto nomem;  	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", @@ -3775,16 +4081,17 @@ out:  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])  {  	struct kvm_shadow_walk_iterator iterator; +	u64 spte;  	int nr_sptes = 0; -	spin_lock(&vcpu->kvm->mmu_lock); -	for_each_shadow_entry(vcpu, addr, iterator) { -		sptes[iterator.level-1] = *iterator.sptep; +	walk_shadow_page_lockless_begin(vcpu); +	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { +		sptes[iterator.level-1] = spte;  		nr_sptes++; -		if (!is_shadow_present_pte(*iterator.sptep)) +		if (!is_shadow_present_pte(spte))  			break;  	} -	spin_unlock(&vcpu->kvm->mmu_lock); +	walk_shadow_page_lockless_end(vcpu);  	return nr_sptes;  }  |
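
The largest structural change above folds the old kvm_pte_chain/kvm_rmap_desc pair into a single pte_list scheme used for both rmaps and parent-pte tracking: if bit 0 of the list head is clear, the head itself is the lone spte pointer; if it is set, (head & ~1) points to a pte_list_desc holding up to PTE_LIST_EXT sptes plus a ->more chain. The sketch below is a minimal userspace model of that tagging, with calloc() standing in for the per-vcpu mmu_pte_list_desc_cache and the return-count bookkeeping of the real pte_list_add() dropped for brevity.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PTE_LIST_EXT 4

struct pte_list_desc {
	uint64_t *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

/* Add @spte to the list rooted at *@pte_list. */
static void pte_list_add(uint64_t *spte, unsigned long *pte_list)
{
	struct pte_list_desc *desc;
	int i;

	if (!*pte_list) {			/* empty -> single spte */
		*pte_list = (unsigned long)spte;
		return;
	}
	if (!(*pte_list & 1)) {			/* single -> first descriptor */
		desc = calloc(1, sizeof(*desc));
		desc->sptes[0] = (uint64_t *)*pte_list;
		desc->sptes[1] = spte;
		*pte_list = (unsigned long)desc | 1;
		return;
	}
	desc = (struct pte_list_desc *)(*pte_list & ~1ul);	/* many -> many */
	while (desc->sptes[PTE_LIST_EXT - 1] && desc->more)
		desc = desc->more;
	if (desc->sptes[PTE_LIST_EXT - 1]) {
		desc->more = calloc(1, sizeof(*desc));
		desc = desc->more;
	}
	for (i = 0; desc->sptes[i]; ++i)
		;
	desc->sptes[i] = spte;
}

int main(void)
{
	unsigned long pte_list = 0;
	uint64_t sptes[6];
	int i;

	for (i = 0; i < 6; i++)
		pte_list_add(&sptes[i], &pte_list);

	printf("head is %s\n", (pte_list & 1) ? "a tagged descriptor" : "a single spte");
	return 0;
}

Because parent-pte tracking now uses the same helpers, kvm_mmu_page loses the multimapped/parent_pte special case and kvm_mmu_mark_parents_unsync() becomes a plain pte_list_walk() over sp->parent_ptes.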
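
The shadow_trap/notrap nonpresent ptes are gone; MMIO accesses are instead cached directly in the spte: mark_mmio_spte() tags the entry with shadow_mmio_mask and packs the gfn plus the write/user access bits into it, and handle_mmio_page_fault_common() decodes that later to emulate the access without another guest page-table walk. Below is a minimal standalone model of just that pack/unpack arithmetic; the mask value is made up for illustration (the real one is installed by the vendor module via kvm_mmu_set_mmio_spte_mask()).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1ULL << PAGE_SHIFT) - 1))
#define ACC_WRITE_MASK	(1ULL << 1)	/* PT_WRITABLE_MASK */
#define ACC_USER_MASK	(1ULL << 2)	/* PT_USER_MASK */

/* Illustrative value only; see kvm_mmu_set_mmio_spte_mask(). */
static const uint64_t shadow_mmio_mask = 0xffULL << 49;

static uint64_t mark_mmio_spte(uint64_t gfn, unsigned int access)
{
	access &= ACC_WRITE_MASK | ACC_USER_MASK;
	return shadow_mmio_mask | access | (gfn << PAGE_SHIFT);
}

static int is_mmio_spte(uint64_t spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
}

static uint64_t get_mmio_spte_gfn(uint64_t spte)
{
	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
}

static unsigned int get_mmio_spte_access(uint64_t spte)
{
	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
}

int main(void)
{
	uint64_t spte = mark_mmio_spte(0x1234, ACC_WRITE_MASK | ACC_USER_MASK);

	assert(is_mmio_spte(spte));
	assert(get_mmio_spte_gfn(spte) == 0x1234);
	assert(get_mmio_spte_access(spte) == (ACC_WRITE_MASK | ACC_USER_MASK));
	printf("spte %#llx -> gfn %#llx, access %#x\n",
	       (unsigned long long)spte,
	       (unsigned long long)get_mmio_spte_gfn(spte),
	       get_mmio_spte_access(spte));
	return 0;
}

This is also why is_shadow_present_pte() now checks PT_PRESENT_MASK while excluding MMIO sptes, and why nonpaging_page_fault() and tdp_page_fault() short-circuit PFERR_RSVD_MASK faults into handle_mmio_page_fault().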
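
On 32-bit hosts a 64-bit spte cannot be loaded atomically, so the new __get_spte_lockless() reads the two halves with a seqcount-style retry: the low word is re-read at the end, and sp->clear_spte_count (bumped by count_spte_clear() whenever a present spte is cleared) catches a clear that raced with the read; as the patch comment notes, the idea follows gup_get_pte() in arch/x86/mm/gup.c. The userspace model below keeps only that clear/read pairing; __atomic builtins stand in for the kernel's smp_wmb()/smp_rmb(), and the set path and the xchg-based slow path are left out.

#include <stdint.h>

union split_spte {
	struct {
		uint32_t spte_low;
		uint32_t spte_high;
	};
	uint64_t spte;
};

struct shadow_page {
	union split_spte slot;
	int clear_spte_count;	/* bumped each time a present spte is cleared */
};

/* Writer: clear a present spte (present -> nonpresent). */
static void spte_clear(struct shadow_page *sp)
{
	/* Drop the low (present) word first so a racing reader can never
	 * pair the old high word with a new low word. */
	__atomic_store_n(&sp->slot.spte_low, 0, __ATOMIC_RELEASE);
	__atomic_store_n(&sp->slot.spte_high, 0, __ATOMIC_RELEASE);
	__atomic_fetch_add(&sp->clear_spte_count, 1, __ATOMIC_RELEASE);
}

/* Reader: lockless 64-bit read with retry, as in __get_spte_lockless(). */
static uint64_t spte_get_lockless(struct shadow_page *sp)
{
	union split_spte spte;
	int count;

retry:
	count = __atomic_load_n(&sp->clear_spte_count, __ATOMIC_ACQUIRE);
	spte.spte_low = __atomic_load_n(&sp->slot.spte_low, __ATOMIC_ACQUIRE);
	spte.spte_high = __atomic_load_n(&sp->slot.spte_high, __ATOMIC_ACQUIRE);

	/* Retry if the low word moved or a clear completed meanwhile. */
	if (spte.spte_low != __atomic_load_n(&sp->slot.spte_low, __ATOMIC_ACQUIRE) ||
	    count != __atomic_load_n(&sp->clear_spte_count, __ATOMIC_ACQUIRE))
		goto retry;

	return spte.spte;
}

int main(void)
{
	struct shadow_page sp = { .slot.spte = 0x123456789abcdefULL };
	uint64_t v = spte_get_lockless(&sp);

	spte_clear(&sp);
	return v == 0x123456789abcdefULL ? 0 : 1;
}

The walkers that rely on this (for_each_shadow_entry_lockless and the new mmio fault path) additionally bracket themselves with walk_shadow_page_lockless_begin()/end(), which take rcu_read_lock() and bump kvm->arch.reader_counter so that kvm_mmu_commit_zap_page() defers the actual page frees to free_pages_rcu() while any lockless walker may still be touching them.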