From 1f5e7eb7868e42227ac426c96d437117e6e06e8e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 26 Apr 2023 18:37:00 +0200
Subject: x86/smp: Make stop_other_cpus() more robust

Tony reported intermittent lockups on poweroff. His analysis identified the
wbinvd() in stop_this_cpu() as the culprit. This was added to ensure that
on SME-enabled machines a kexec() does not leave any stale data in the
caches when switching from encrypted to non-encrypted mode or vice versa.

That wbinvd() is conditional on the SME feature bit, which is read directly
from CPUID. But that readout does not check whether the CPUID leaf is
available or not. If it's not available, the CPU returns the value of the
highest supported leaf instead. Depending on that content, the "SME" bit
might or might not be set.

That's incorrect but harmless. Making the CPUID readout conditional makes
the observed hangs go away, but it does not fix the underlying problem:

    CPU0                                    CPU1

    stop_other_cpus()
      send_IPIs(REBOOT);                    stop_this_cpu()
      while (num_online_cpus() > 1);          set_online(false);
      proceed... -> hang
                                              wbinvd()

WBINVD is an expensive operation, and if multiple CPUs issue it at the same
time the resulting delays are even larger. But CPU0 has already observed
num_online_cpus() drop to 1 and proceeds, which causes the system to hang.

This issue exists independently of WBINVD, but the delays caused by WBINVD
make it more prominent.

Make this more robust by adding a cpumask which is initialized to the
online CPU mask before sending the IPIs; CPUs clear their bit in
stop_this_cpu() after the WBINVD has completed. Check for that cpumask to
become empty in stop_other_cpus() instead of watching num_online_cpus().

The cpumask cannot plug all holes either, but it's better than a raw
counter and allows the NMI fallback IPI to be restricted to the CPUs which
have not reported within the timeout window.

Fixes: 08f253ec3767 ("x86/cpu: Clear SME feature flag when not in use")
Reported-by: Tony Battersby
Signed-off-by: Thomas Gleixner
Reviewed-by: Borislav Petkov (AMD)
Reviewed-by: Ashok Raj
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetics.com
Link: https://lore.kernel.org/r/87h6r770bv.ffs@tglx
---
 arch/x86/kernel/process.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel/process.c')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dac41a0072ea..05924bc4b3af 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -759,13 +759,23 @@ bool xen_set_default_idle(void)
 }
 #endif
 
+struct cpumask cpus_stop_mask;
+
 void __noreturn stop_this_cpu(void *dummy)
 {
+        unsigned int cpu = smp_processor_id();
+
         local_irq_disable();
+
         /*
-         * Remove this CPU:
+         * Remove this CPU from the online mask and disable it
+         * unconditionally. This might be redundant in case that the reboot
+         * vector was handled late and stop_other_cpus() sent an NMI.
+         *
+         * According to SDM and APM NMIs can be accepted even after soft
+         * disabling the local APIC.
          */
-        set_cpu_online(smp_processor_id(), false);
+        set_cpu_online(cpu, false);
         disable_local_APIC();
         mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
@@ -783,6 +793,15 @@ void __noreturn stop_this_cpu(void *dummy)
          */
         if (cpuid_eax(0x8000001f) & BIT(0))
                 native_wbinvd();
+
+        /*
+         * This brings a cache line back and dirties it, but
+         * native_stop_other_cpus() will overwrite cpus_stop_mask after it
+         * observed that all CPUs reported stop. This write will invalidate
+         * the related cache line on this CPU.
+         */
+        cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
         for (;;) {
                 /*
                  * Use native_halt() so that memory contents don't change
-- cgit
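
The hunks above show only the reporting side in stop_this_cpu(). The waiting
side described in the changelog lives in native_stop_other_cpus()
(arch/x86/kernel/smp.c) and is not part of this diff. The following is a
rough sketch of that flow based on the changelog, not the actual patch; the
IPI helper and the timeout value are approximations:

static void stop_other_cpus_sketch(void)
{
        unsigned long timeout;

        /* Record every online CPU that still has to report in. */
        cpumask_copy(&cpus_stop_mask, cpu_online_mask);
        cpumask_clear_cpu(smp_processor_id(), &cpus_stop_mask);

        if (!cpumask_empty(&cpus_stop_mask)) {
                /* Kick the other CPUs into stop_this_cpu(). */
                apic->send_IPI_allbutself(REBOOT_VECTOR);

                /*
                 * Wait for the mask to become empty instead of watching
                 * num_online_cpus(): a CPU clears its bit only after its
                 * wbinvd() has completed.
                 */
                for (timeout = USEC_PER_SEC; timeout; timeout--) {
                        if (cpumask_empty(&cpus_stop_mask))
                                break;
                        udelay(1);
                }
        }

        /*
         * NMI fallback: only the CPUs still set in cpus_stop_mask missed
         * the timeout window and need to be poked again.
         */
}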
From 9b040453d4440659f33dc6f0aa26af418ebfe70b Mon Sep 17 00:00:00 2001
From: Tony Battersby
Date: Thu, 15 Jun 2023 22:33:52 +0200
Subject: x86/smp: Don't access non-existing CPUID leaf

stop_this_cpu() tests CPUID leaf 0x8000001f::EAX unconditionally. Intel
CPUs return the content of the highest supported leaf when a non-existing
leaf is read, while AMD CPUs return all zeros for unsupported leaves.

So on Intel CPUs the result of the test is a lottery. While harmless, it's
incorrect and causes the conditional wbinvd() to be issued where not
required.

Check whether the leaf is supported before reading it.

[ tglx: Adjusted changelog ]

Fixes: 08f253ec3767 ("x86/cpu: Clear SME feature flag when not in use")
Signed-off-by: Tony Battersby
Signed-off-by: Thomas Gleixner
Reviewed-by: Mario Limonciello
Reviewed-by: Borislav Petkov (AMD)
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetics.com
Link: https://lore.kernel.org/r/20230615193330.322186388@linutronix.de
---
 arch/x86/kernel/process.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel/process.c')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 05924bc4b3af..ff9b80a0e3e3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -763,6 +763,7 @@ struct cpumask cpus_stop_mask;
 
 void __noreturn stop_this_cpu(void *dummy)
 {
+        struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
         unsigned int cpu = smp_processor_id();
 
         local_irq_disable();
@@ -777,7 +778,7 @@
          */
         set_cpu_online(cpu, false);
         disable_local_APIC();
-        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+        mcheck_cpu_clear(c);
 
         /*
          * Use wbinvd on processors that support SME. This provides support
@@ -791,7 +792,7 @@
          * Test the CPUID bit directly because the machine might've cleared
          * X86_FEATURE_SME due to cmdline options.
          */
-        if (cpuid_eax(0x8000001f) & BIT(0))
+        if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
                 native_wbinvd();
 
         /*
-- cgit
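
For reference, the guard added above relies on the standard CPUID discovery
rule: extended leaf 0x80000000 reports the highest supported extended leaf
in EAX, and the kernel caches that value in cpuinfo_x86::extended_cpuid_level
at boot. A minimal sketch of the same check expressed against the raw CPUID
interface (illustration only, not part of the patch):

static bool cpu_advertises_sme(void)
{
        unsigned int max_ext_leaf = cpuid_eax(0x80000000);

        /* Leaf 0x8000001f does not exist on this CPU. */
        if (max_ext_leaf < 0x8000001f)
                return false;

        /* EAX bit 0 of leaf 0x8000001f advertises SME. */
        return cpuid_eax(0x8000001f) & BIT(0);
}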
From 3aec4ecb3d1f313a8ab985df7cab07c4af81f478 Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Fri, 23 Jun 2023 18:55:29 -0400
Subject: x86: Rewrite ret_from_fork() in C

When kCFI is enabled, special handling is needed for the indirect call to
the kernel thread function. Rewrite the ret_from_fork() function in C so
that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel)
Signed-off-by: Brian Gerst
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Kees Cook
Reviewed-by: Sami Tolvanen
Link: https://lkml.kernel.org/r/20230623225529.34590-3-brgerst@gmail.com
---
 arch/x86/entry/entry_32.S        | 30 ++++++++----------------------
 arch/x86/entry/entry_64.S        | 33 ++++++++-------------------------
 arch/x86/include/asm/switch_to.h |  4 +++-
 arch/x86/kernel/process.c        | 22 +++++++++++++++++++++-
 4 files changed, 40 insertions(+), 49 deletions(-)

(limited to 'arch/x86/kernel/process.c')

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e56123f03a79..6e6af42e044a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,36 +727,22 @@ SYM_CODE_END(__switch_to_asm)
  * edi: kernel thread arg
  */
 .pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
+        movl    %esp, %edx      /* regs */
+
         /* return address for the stack unwinder */
         pushl   $.Lsyscall_32_done
 
         FRAME_BEGIN
-        pushl   %eax
-        call    schedule_tail
+        /* prev already in EAX */
+        movl    %ebx, %ecx      /* fn */
+        pushl   %edi            /* fn_arg */
+        call    ret_from_fork
         addl    $4, %esp
         FRAME_END
 
-        testl   %ebx, %ebx
-        jnz     1f              /* kernel threads are uncommon */
-
-2:
-        /* When we fork, we trace the syscall return in the child, too. */
-        leal    4(%esp), %eax
-        call    syscall_exit_to_user_mode
         RET
-
-        /* kernel thread */
-1:      movl    %edi, %eax
-        CALL_NOSPEC ebx
-        /*
-         * A kernel thread is allowed to return here after successfully
-         * calling kernel_execve(). Exit to userspace to complete the execve()
-         * syscall.
-         */
-        movl    $0, PT_EAX(%esp)
-        jmp     2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..91f6818884fa 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
  * r12: kernel thread arg
  */
 .pushsection .text, "ax"
-        __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
-        UNWIND_HINT_END_OF_STACK
+SYM_CODE_START(ret_from_fork_asm)
+        UNWIND_HINT_REGS
         ANNOTATE_NOENDBR // copy_thread
         CALL_DEPTH_ACCOUNT
-        movq    %rax, %rdi
-        call    schedule_tail                   /* rdi: 'prev' task parameter */
 
-        testq   %rbx, %rbx                      /* from kernel_thread? */
-        jnz     1f                              /* kernel threads are uncommon */
+        movq    %rax, %rdi      /* prev */
+        movq    %rsp, %rsi      /* regs */
+        movq    %rbx, %rdx      /* fn */
+        movq    %r12, %rcx      /* fn_arg */
+        call    ret_from_fork
 
-2:
-        UNWIND_HINT_REGS
-        movq    %rsp, %rdi
-        call    syscall_exit_to_user_mode       /* returns with IRQs disabled */
         jmp     swapgs_restore_regs_and_return_to_usermode
-
-1:
-        /* kernel thread */
-        UNWIND_HINT_END_OF_STACK
-        movq    %r12, %rdi
-        CALL_NOSPEC rbx
-        /*
-         * A kernel thread is allowed to return here after successfully
-         * calling kernel_execve(). Exit to userspace to complete the execve()
-         * syscall.
-         */
-        movq    $0, RAX(%rsp)
-        jmp     2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
 __visible struct task_struct *__switch_to(struct task_struct *prev,
                                           struct task_struct *next);
 
-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+                             int (*fn)(void *), void *fn_arg);
 
 /*
  * This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff9b80a0e3e3..72015dba72ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
         return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+                             int (*fn)(void *), void *fn_arg)
+{
+        schedule_tail(prev);
+
+        /* Is this a kernel thread? */
+        if (unlikely(fn)) {
+                fn(fn_arg);
+                /*
+                 * A kernel thread is allowed to return here after successfully
+                 * calling kernel_execve(). Exit to userspace to complete the
+                 * execve() syscall.
+                 */
+                regs->ax = 0;
+        }
+
+        syscall_exit_to_user_mode(regs);
+}
+
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 {
         unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
         frame = &fork_frame->frame;
 
         frame->bp = encode_frame_pointer(childregs);
-        frame->ret_addr = (unsigned long) ret_from_fork;
+        frame->ret_addr = (unsigned long) ret_from_fork_asm;
         p->thread.sp = (unsigned long) fork_frame;
         p->thread.io_bitmap = NULL;
         p->thread.iopl_warn = 0;
-- cgit
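
A note on why moving the indirect call into C helps: with kCFI
(CONFIG_CFI_CLANG) the compiler instruments every C-level indirect call with
a check of the target's function-type hash, which it cannot do for a
CALL_NOSPEC issued from hand-written assembly. A minimal, stand-alone
illustration (not kernel code) of the kind of call site the compiler can now
instrument inside ret_from_fork():

typedef int (*kthread_fn_t)(void *);

static int run_kthread_fn(kthread_fn_t fn, void *arg)
{
        /*
         * With kCFI the compiler emits a type check here, verifying that
         * fn really points to a function of type int (*)(void *) before
         * the indirect jump. The old asm-level CALL_NOSPEC rbx/ebx
         * bypassed that check entirely.
         */
        return fn(arg);
}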