Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--  arch/x86/mm/tlb.c | 198
1 file changed, 122 insertions(+), 76 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e96b99eb800c..bddd6b3cee1d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -180,13 +181,29 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 
+static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+{
+	/*
+	 * Check if the current (previous) task has access to the memory
+	 * of the @tsk (next) task. If access is denied, make sure to
+	 * issue an IBPB to stop user->user Spectre-v2 attacks.
+	 *
+	 * Note: __ptrace_may_access() returns 0 or -ERRNO.
+	 */
+	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
+		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+}
+
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +257,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask().  We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -262,18 +300,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * one process from doing Spectre-v2 attacks on another.
 		 *
 		 * As an optimization, flush indirect branches only when
-		 * switching into processes that disable dumping. This
-		 * protects high value processes like gpg, without having
-		 * too high performance overhead. IBPB is *expensive*!
-		 *
-		 * This will not flush branches when switching into kernel
-		 * threads. It will also not flush if we switch to idle
-		 * thread and back to the same process. It will flush if we
-		 * switch to a different non-dumpable process.
+		 * switching into a process that can't be ptraced by the
+		 * current one (in that case, the attacker has a much more
+		 * convenient way to tamper with the next process than
+		 * branch buffer poisoning).
 		 */
-		if (tsk && tsk->mm &&
-		    tsk->mm->context.ctx_id != last_ctx_id &&
-		    get_dumpable(tsk->mm) != SUID_DUMP_USER)
+		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
+				ibpb_needed(tsk, last_ctx_id))
 			indirect_branch_prediction_barrier();
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -308,46 +341,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		/* Let nmi_uaccess_okay() know that we're changing CR3. */
 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
 		barrier();
+	}
 
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently.  Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this.  The idle code should
-			 *  be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
 
 		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently.  Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this.  The idle code should
+		 *  be rearranged to call this before rcu_idle_enter().)
 		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-		/* Make sure we write CR3 before loaded_mm. */
-		barrier();
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
 
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
-	load_mm_cr4(next);
-	switch_ldt(real_prev, next);
+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+	if (next != real_prev) {
+		load_mm_cr4(next);
+		switch_ldt(real_prev, next);
+	}
 }
 
 /*
@@ -368,20 +403,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here.  We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy.  We do, however, need to maintain coherence of
-		 * paging-structure caches.  We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -468,6 +490,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -528,17 +553,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	    f->new_tlb_gen == local_tlb_gen + 1 &&
 	    f->new_tlb_gen == mm_tlb_gen) {
 		/* Partial flush */
-		unsigned long addr;
-		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+		unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+		unsigned long addr = f->start;
 
-		addr = f->start;
 		while (addr < f->end) {
 			__flush_tlb_one_user(addr);
-			addr += PAGE_SIZE;
+			addr += 1UL << f->stride_shift;
 		}
 		if (local)
-			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
-		trace_tlb_flush(reason, nr_pages);
+			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+		trace_tlb_flush(reason, nr_invalidate);
 	} else {
 		/* Full flush. */
 		local_flush_tlb();
@@ -571,6 +595,11 @@ static void flush_tlb_func_remote(void *info)
 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
 }
 
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
@@ -606,8 +635,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	/*
+	 * If no page tables were freed, we can skip sending IPIs to
+	 * CPUs in lazy TLB mode. They will flush their TLBs themselves
+	 * at the next context switch.
+	 *
+	 * However, if page tables are getting freed, we need to send the
+	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+	 * up on the new contents of what used to be page tables, while
+	 * doing a speculative memory access.
+	 */
+	if (info->freed_tables)
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
+	else
+		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+				(void *)info, 1, GFP_ATOMIC, cpumask);
 }
 
 /*
@@ -623,12 +667,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag)
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables)
 {
 	int cpu;
 
 	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
 		.mm = mm,
+		.stride_shift = stride_shift,
+		.freed_tables = freed_tables,
 	};
 
 	cpu = get_cpu();
@@ -638,8 +685,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&
-	    !(vmflag & VM_HUGETLB) &&
-	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+	    ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
 		info.start = start;
 		info.end = end;
 	} else {
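Note on the new flush_tlb_mm_range() interface: the old vmflag argument is replaced by an explicit stride_shift and a freed_tables flag. The sketch below shows how a caller might drive the new prototype; flush_one_pmd_range() is a hypothetical helper invented for illustration and is not part of this patch, only the flush_tlb_mm_range() signature visible in the diff is assumed.

#include <linux/mm_types.h>	/* struct mm_struct */
#include <asm/tlbflush.h>	/* flush_tlb_mm_range() */

/* Hypothetical caller, illustration only. */
static void flush_one_pmd_range(struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	/*
	 * Pass PMD_SHIFT (21 on x86-64) as the stride for 2MB mappings,
	 * so the range is counted and walked in 2MB steps rather than
	 * 4kB pages when compared against tlb_single_page_flush_ceiling.
	 *
	 * freed_tables == false: no page-table pages were freed, so
	 * native_flush_tlb_others() may skip the IPI to CPUs sitting in
	 * lazy TLB mode; they catch up via tlb_gen at their next switch.
	 */
	flush_tlb_mm_range(mm, start, end, PMD_SHIFT, false);
}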
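For a quick sanity check of the new partial-flush arithmetic, where nr_invalidate = (end - start) >> stride_shift and the loop steps by 1UL << stride_shift, the stand-alone user-space program below mirrors the loop outside the kernel. The DEMO_* shift values are hard-coded x86-64 assumptions and the counter merely stands in for __flush_tlb_one_user(); it is not kernel code.

#include <stdio.h>

#define DEMO_PAGE_SHIFT	12	/* 4 KiB base pages (x86-64) */
#define DEMO_PMD_SHIFT	21	/* 2 MiB huge pages (x86-64) */

static unsigned long count_invalidations(unsigned long start,
					 unsigned long end,
					 unsigned int stride_shift)
{
	/* Same counting rule as the patched partial-flush path. */
	unsigned long nr_invalidate = (end - start) >> stride_shift;
	unsigned long addr = start;
	unsigned long issued = 0;

	while (addr < end) {
		issued++;			/* stands in for __flush_tlb_one_user(addr) */
		addr += 1UL << stride_shift;
	}

	/* For stride-aligned ranges the two counts agree. */
	return issued == nr_invalidate ? nr_invalidate : issued;
}

int main(void)
{
	unsigned long start = 0x7f0000000000UL;
	unsigned long end = start + (8UL << DEMO_PMD_SHIFT);	/* 16 MiB */

	printf("4k stride: %lu invalidations\n",
	       count_invalidations(start, end, DEMO_PAGE_SHIFT));	/* 4096 */
	printf("2M stride: %lu invalidations\n",
	       count_invalidations(start, end, DEMO_PMD_SHIFT));	/* 8 */
	return 0;
}

With a 2MB stride the 16 MiB range costs 8 invalidations instead of 4096, which is why it can now stay under the tlb_single_page_flush_ceiling of 33 and avoid a full flush.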