From 6b32c126d33d5cb379bca280ab8acedc1ca978ff Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 Oct 2017 20:30:12 +0200 Subject: x86/alternatives: Fix alt_max_short macro to really be a max() The alt_max_short() macro in asm/alternative.h does not work as intended, leading to nasty bugs. E.g. alt_max_short("1", "3") evaluates to 3, but alt_max_short("3", "1") evaluates to 1 -- not exactly the maximum of 1 and 3. In fact, I had to learn it the hard way by crashing my kernel in not so funny ways by attempting to make use of the ALTENATIVE_2 macro with alternatives where the first one was larger than the second one. According to [1] and commit dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") the right handed side should read "-(-(a < b))" not "-(-(a - b))". Fix that, to make the macro work as intended. While at it, fix up the comments regarding the additional "-", too. It's not about gas' usage of s32 but brain dead logic of having a "true" value of -1 for the < operator ... *sigh* Btw., the one in asm/alternative-asm.h is correct. And, apparently, all current users of ALTERNATIVE_2() pass same sized alternatives, avoiding to hit the bug. [1] http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax Reviewed-and-tested-by: Borislav Petkov Fixes: dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") Signed-off-by: Mathias Krause Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1507228213-13095-1-git-send-email-minipli@googlemail.com --- arch/x86/include/asm/alternative-asm.h | 4 +++- arch/x86/include/asm/alternative.h | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636bac7372..6c98821fef5e 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index c096624137ae..ccbe24e697c4 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -103,12 +103,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is -- cgit From a3b7424392924e778b608e30ee321f7b10cc94b8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Oct 2017 17:48:54 +0200 Subject: x86/hyperv: Clear vCPU banks between calls to avoid flushing unneeded vCPUs hv_flush_pcpu_ex structures are not cleared between calls for performance reasons (they're variable size up to PAGE_SIZE each) but we must clear hv_vp_set.bank_contents part of it to avoid flushing unneeded vCPUs. The rest of the structure is formed correctly. To do the clearing in an efficient way stash the maximum possible vCPU number (this may differ from Linux CPU id). Reported-by: Jork Loeser Signed-off-by: Vitaly Kuznetsov Cc: Dexuan Cui Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20171006154854.18092-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 5 +++++ arch/x86/hyperv/mmu.c | 17 ++++++++++++----- arch/x86/include/asm/mshyperv.h | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a8eb550c40f..a5db63f728a2 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -85,6 +85,8 @@ EXPORT_SYMBOL_GPL(hyperv_cs); u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); +u32 hv_max_vp_index; + static int hv_cpu_init(unsigned int cpu) { u64 msr_vp_index; @@ -93,6 +95,9 @@ static int hv_cpu_init(unsigned int cpu) hv_vp_index[smp_processor_id()] = msr_vp_index; + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + return 0; } diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 39e7f6e50919..9502d04c0c95 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -76,6 +76,18 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, { int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + /* valid_bank_mask can represent up to 64 banks */ + if (hv_max_vp_index / 64 >= 64) + return 0; + + /* + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex + * structs are not cleared between calls, we risk flushing unneeded + * vCPUs otherwise. + */ + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; + /* * Some banks may end up being empty but this is acceptable. */ @@ -83,11 +95,6 @@ static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, vcpu = hv_cpu_number_to_vp_number(cpu); vcpu_bank = vcpu / 64; vcpu_offset = vcpu % 64; - - /* valid_bank_mask can represent up to 64 banks */ - if (vcpu_bank >= 64) - return 0; - __set_bit(vcpu_offset, (unsigned long *) &flush->hv_vp_set.bank_contents[vcpu_bank]); if (vcpu_bank >= nr_bank) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 738503e1f80c..530f448fddaf 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -289,6 +289,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, * to this information. */ extern u32 *hv_vp_index; +extern u32 hv_max_vp_index; /** * hv_cpu_number_to_vp_number() - Map CPU to VP. -- cgit From b956575bed91ecfb136a8300742ecbbf451471ab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 9 Oct 2017 09:50:49 -0700 Subject: x86/mm: Flush more aggressively in lazy TLB mode Since commit: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") x86's lazy TLB mode has been all the way lazy: when running a kernel thread (including the idle thread), the kernel keeps using the last user mm's page tables without attempting to maintain user TLB coherence at all. From a pure semantic perspective, this is fine -- kernel threads won't attempt to access user pages, so having stale TLB entries doesn't matter. Unfortunately, I forgot about a subtlety. By skipping TLB flushes, we also allow any paging-structure caches that may exist on the CPU to become incoherent. This means that we can have a paging-structure cache entry that references a freed page table, and the CPU is within its rights to do a speculative page walk starting at the freed page table. I can imagine this causing two different problems: - A speculative page walk starting from a bogus page table could read IO addresses. I haven't seen any reports of this causing problems. - A speculative page walk that involves a bogus page table can install garbage in the TLB. Such garbage would always be at a user VA, but some AMD CPUs have logic that triggers a machine check when it notices these bogus entries. I've seen a couple reports of this. Boris further explains the failure mode: > It is actually more of an optimization which assumes that paging-structure > entries are in WB DRAM: > > "TlbCacheDis: cacheable memory disable. Read-write. 0=Enables > performance optimization that assumes PML4, PDP, PDE, and PTE entries > are in cacheable WB-DRAM; memory type checks may be bypassed, and > addresses outside of WB-DRAM may result in undefined behavior or NB > protocol errors. 1=Disables performance optimization and allows PML4, > PDP, PDE and PTE entries to be in any memory type. Operating systems > that maintain page tables in memory types other than WB- DRAM must set > TlbCacheDis to insure proper operation." > > The MCE generated is an NB protocol error to signal that > > "Link: A specific coherent-only packet from a CPU was issued to an > IO link. This may be caused by software which addresses page table > structures in a memory type other than cacheable WB-DRAM without > properly configuring MSRC001_0015[TlbCacheDis]. This may occur, for > example, when page table structure addresses are above top of memory. In > such cases, the NB will generate an MCE if it sees a mismatch between > the memory operation generated by the core and the link type." > > I'm assuming coherent-only packets don't go out on IO links, thus the > error. To fix this, reinstate TLB coherence in lazy mode. With this patch applied, we do it in one of two ways: - If we have PCID, we simply switch back to init_mm's page tables when we enter a kernel thread -- this seems to be quite cheap except for the cost of serializing the CPU. - If we don't have PCID, then we set a flag and switch to init_mm the first time we would otherwise need to flush the TLB. The /sys/kernel/debug/x86/tlb_use_lazy_mode debug switch can be changed to override the default mode for benchmarking. In theory, we could optimize this better by only flushing the TLB in lazy CPUs when a page table is freed. Doing that would require auditing the mm code to make sure that all page table freeing goes through tlb_remove_page() as well as reworking some data structures to implement the improved flush logic. Reported-by: Markus Trippelsdorf Reported-by: Adam Borowski Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Borkmann Cc: Eric Biggers Cc: Johannes Hirte Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Roman Kagan Cc: Thomas Gleixner Fixes: 94b1b03b519b ("x86/mm: Rework lazy TLB mode and TLB freshness tracking") Link: http://lkml.kernel.org/r/20171009170231.fkpraqokz6e4zeco@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 8 +- arch/x86/include/asm/tlbflush.h | 24 ++++++ arch/x86/mm/tlb.c | 153 +++++++++++++++++++++++++++---------- 3 files changed, 136 insertions(+), 49 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c120b5db178a..3c856a15b98e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -126,13 +126,7 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) DEBUG_LOCKS_WARN_ON(preemptible()); } -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ - int cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, mm_cpumask(mm))) - cpumask_clear_cpu(cpu, mm_cpumask(mm)); -} +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4893abf7f74f..d362161d3291 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -82,6 +82,13 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point + * to init_mm when we switch to a kernel thread (e.g. the idle thread). If + * it's false, then we immediately switch CR3 when entering a kernel thread. + */ +DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + /* * 6 because 6 should be plenty and struct tlb_state will fit in * two cache lines. @@ -104,6 +111,23 @@ struct tlb_state { u16 loaded_mm_asid; u16 next_asid; + /* + * We can be in one of several states: + * + * - Actively using an mm. Our CPU's bit will be set in + * mm_cpumask(loaded_mm) and is_lazy == false; + * + * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit + * will not be set in mm_cpumask(&init_mm) and is_lazy == false. + * + * - Lazily using a real mm. loaded_mm != &init_mm, our bit + * is set in mm_cpumask(loaded_mm), but is_lazy == true. + * We're heuristically guessing that the CR3 load we + * skipped more than makes up for the overhead added by + * lazy mode. + */ + bool is_lazy; + /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 49d9778376d7..658bf0090565 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -30,6 +30,8 @@ atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); +DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, u16 *new_asid, bool *need_flush) { @@ -80,7 +82,7 @@ void leave_mm(int cpu) return; /* Warn if we're not lazy. */ - WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, __flush_tlb_all(); } #endif + this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); - if (cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * There's nothing to do: we weren't lazy, and we - * aren't changing our mm. We don't need to flush - * anything, nor do we need to update CR3, CR4, or - * LDTR. - */ - return; - } - - /* Resume remote flushes and then read tlb_gen. */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < - next_tlb_gen) { - /* - * Ideally, we'd have a flush_tlb() variant that - * takes the known CR3 value as input. This would - * be faster on Xen PV and on hypothetical CPUs - * on which INVPCID is fast. - */ - this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, - next_tlb_gen); - write_cr3(build_cr3(next, prev_asid)); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); - } - /* - * We just exited lazy mode, which means that CR4 and/or LDTR - * may be stale. (Changes to the required CR4 and LDTR states - * are not reflected in tlb_gen.) + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + + return; } else { u16 new_asid; bool need_flush; @@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } /* Stop remote flushes for the previous mm */ - if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - - VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* * Start remote flushes and then read tlb_gen. @@ -232,6 +212,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_ldt(real_prev, next); } +/* + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + if (static_branch_unlikely(&tlb_use_lazy_mode)) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } +} + /* * Call this when reinitializing a CPU. It fixes the following potential * problems: @@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + if (unlikely(loaded_mm == &init_mm)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { + if (this_cpu_read(cpu_tlbstate.is_lazy)) { /* - * We're in lazy mode -- don't flush. We can get here on - * remote flushes due to races and on local flushes if a - * kernel thread coincidentally flushes the mm it's lazily - * still using. + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. */ + switch_mm_irqs_off(NULL, &init_mm, NULL); return; } @@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[2]; + + buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0'; + buf[1] = '\n'; + + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t tlblazy_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + bool val; + + if (kstrtobool_from_user(user_buf, count, &val)) + return -EINVAL; + + if (val) + static_branch_enable(&tlb_use_lazy_mode); + else + static_branch_disable(&tlb_use_lazy_mode); + + return count; +} + +static const struct file_operations fops_tlblazy = { + .read = tlblazy_read_file, + .write = tlblazy_write_file, + .llseek = default_llseek, +}; + +static int __init init_tlb_use_lazy_mode(void) +{ + if (boot_cpu_has(X86_FEATURE_PCID)) { + /* + * Heuristic: with PCID on, switching to and from + * init_mm is reasonably fast, but remote flush IPIs + * as expensive as ever, so turn off lazy TLB mode. + * + * We can't do this in setup_pcid() because static keys + * haven't been initialized yet, and it would blow up + * badly. + */ + static_branch_disable(&tlb_use_lazy_mode); + } + + debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlblazy); + return 0; +} +late_initcall(init_tlb_use_lazy_mode); -- cgit