From 0e181bb58143cb4a2e8f01c281b0816cd0e4798e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:34 -0700 Subject: x86/nmi/64: Remove asm code that saves CR2 Now that do_nmi saves CR2, we don't need to save it in asm. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Acked-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3bb2c4302df1..062feb4eb478 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1415,28 +1415,11 @@ end_repeat_nmi: */ call paranoid_entry - /* - * Save off the CR2 register. If we take a page fault in the NMI then - * it could corrupt the CR2 value. If the NMI preempts a page fault - * handler before it was able to read the CR2 register, and then the - * NMI itself takes a page fault, the page fault that was preempted - * will read the information from the NMI page fault and not the - * origin fault. Save it off and restore it if it changes. - * Use the r12 callee-saved register. - */ - movq %cr2, %r12 - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi call do_nmi - /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 -1: testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: -- cgit From 9b6e6a8334d56354853f9c255d1395c2ba570e0a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:35 -0700 Subject: x86/nmi/64: Switch stacks on userspace NMI entry Returning to userspace is tricky: IRET can fail, and ESPFIX can rearrange the stack prior to IRET. The NMI nesting fixup relies on a precise stack layout and atomic IRET. Rather than trying to teach the NMI nesting fixup to handle ESPFIX and failed IRET, punt: run NMIs that came from user mode on the normal kernel stack. This will make some nested NMIs visible to C code, but the C code is okay with that. As a side effect, this should speed up perf: it eliminates an RDMSR when NMIs come from user mode. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 62 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 062feb4eb478..8668bbdd2bca 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1250,18 +1250,72 @@ ENTRY(nmi) * a nested NMI that updated the copy interrupt stack frame, a * jump will be made to the repeat_nmi code that will handle the second * NMI. + * + * However, espfix prevents us from directly returning to userspace + * with a single IRET instruction. Similarly, IRET to user mode + * can fault. We therefore handle NMIs from user space like + * other IST entries. */ /* Use %rdx as our temp variable throughout */ pushq %rdx + testb $3, CS-RIP+8(%rsp) + jz .Lnmi_from_kernel + + /* + * NMI from user mode. We need to run on the thread stack, but we + * can't go through the normal entry paths: NMIs are masked, and + * we don't want to enable interrupts, because then we'll end + * up in an awkward situation in which IRQs are on but NMIs + * are off. + */ + + SWAPGS + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + pushq 5*8(%rdx) /* pt_regs->ss */ + pushq 4*8(%rdx) /* pt_regs->rsp */ + pushq 3*8(%rdx) /* pt_regs->flags */ + pushq 2*8(%rdx) /* pt_regs->cs */ + pushq 1*8(%rdx) /* pt_regs->rip */ + pushq $-1 /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq (%rdx) /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + /* - * If %cs was not the kernel segment, then the NMI triggered in user - * space, which means it is definitely not nested. + * At this point we no longer need to worry about stack damage + * due to nesting -- we're on the normal thread stack and we're + * done with the NMI stack. */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi + + /* + * Return back to user mode. We must *not* do the normal exit + * work, because we don't want to enable interrupts. Fortunately, + * do_nmi doesn't modify pt_regs. + */ + SWAPGS + jmp restore_c_regs_and_iret + +.Lnmi_from_kernel: /* * Check the special variable on the stack to see if NMIs are * executing. -- cgit From 0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:36 -0700 Subject: x86/nmi/64: Improve nested NMI comments I found the nested NMI documentation to be difficult to follow. Improve the comments. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 158 +++++++++++++++++++++++++++------------------- arch/x86/kernel/nmi.c | 4 +- 2 files changed, 94 insertions(+), 68 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8668bbdd2bca..f54d63a60a3b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1237,11 +1237,12 @@ ENTRY(nmi) * If the variable is not set and the stack is not the NMI * stack then: * o Set the special variable on the stack - * o Copy the interrupt frame into a "saved" location on the stack - * o Copy the interrupt frame into a "copy" location on the stack + * o Copy the interrupt frame into an "outermost" location on the + * stack + * o Copy the interrupt frame into an "iret" location on the stack * o Continue processing the NMI * If the variable is set or the previous stack is the NMI stack: - * o Modify the "copy" location to jump to the repeate_nmi + * o Modify the "iret" location to jump to the repeat_nmi * o return back to the first NMI * * Now on exit of the first NMI, we first clear the stack variable @@ -1317,18 +1318,60 @@ ENTRY(nmi) .Lnmi_from_kernel: /* - * Check the special variable on the stack to see if NMIs are - * executing. + * Here's what our stack frame will look like: + * +---------------------------------------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +---------------------------------------------------------+ + * | temp storage for rdx | + * +---------------------------------------------------------+ + * | "NMI executing" variable | + * +---------------------------------------------------------+ + * | iret SS } Copied from "outermost" frame | + * | iret Return RSP } on each loop iteration; overwritten | + * | iret RFLAGS } by a nested NMI to force another | + * | iret CS } iteration if needed. | + * | iret RIP } | + * +---------------------------------------------------------+ + * | outermost SS } initialized in first_nmi; | + * | outermost Return RSP } will not be changed before | + * | outermost RFLAGS } NMI processing is done. | + * | outermost CS } Copied to "iret" frame on each | + * | outermost RIP } iteration. | + * +---------------------------------------------------------+ + * | pt_regs | + * +---------------------------------------------------------+ + * + * The "original" frame is used by hardware. Before re-enabling + * NMIs, we need to be done with it, and we need to leave enough + * space for the asm code here. + * + * We return by executing IRET while RSP points to the "iret" frame. + * That will either return for real or it will loop back into NMI + * processing. + * + * The "outermost" frame is copied to the "iret" frame on each + * iteration of the loop, so each iteration starts with the "iret" + * frame pointing to the final return target. + */ + + /* + * Determine whether we're a nested NMI. + * + * First check "NMI executing". If it's set, then we're nested. + * This will not detect if we interrupted an outer NMI just + * before IRET. */ cmpl $1, -8(%rsp) je nested_nmi /* - * Now test if the previous stack was an NMI stack. - * We need the double check. We check the NMI stack to satisfy the - * race when the first NMI clears the variable before returning. - * We check the variable because the first NMI could be in a - * breakpoint routine using a breakpoint stack. + * Now test if the previous stack was an NMI stack. This covers + * the case where we interrupt an outer NMI after it clears + * "NMI executing" but before IRET. */ lea 6*8(%rsp), %rdx /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ @@ -1344,9 +1387,11 @@ ENTRY(nmi) nested_nmi: /* - * Do nothing if we interrupted the fixup in repeat_nmi. - * It's about to repeat the NMI handler, so we are fine - * with ignoring this one. + * If we interrupted an NMI that is between repeat_nmi and + * end_repeat_nmi, then we must not modify the "iret" frame + * because it's being written by the outer NMI. That's okay; + * the outer NMI handler is about to call do_nmi anyway, + * so we can just resume the outer NMI. */ movq $repeat_nmi, %rdx cmpq 8(%rsp), %rdx @@ -1356,7 +1401,10 @@ nested_nmi: ja nested_nmi_out 1: - /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + /* + * Modify the "iret" frame to point to repeat_nmi, forcing another + * iteration of NMI handling. + */ leaq -1*8(%rsp), %rdx movq %rdx, %rsp leaq -10*8(%rsp), %rdx @@ -1372,61 +1420,27 @@ nested_nmi: nested_nmi_out: popq %rdx - /* No need to check faults here */ + /* We are returning to kernel mode, so this cannot result in a fault. */ INTERRUPT_RETURN first_nmi: - /* - * Because nested NMIs will use the pushed location that we - * stored in rdx, we must keep that space available. - * Here's what our stack frame will look like: - * +-------------------------+ - * | original SS | - * | original Return RSP | - * | original RFLAGS | - * | original CS | - * | original RIP | - * +-------------------------+ - * | temp storage for rdx | - * +-------------------------+ - * | NMI executing variable | - * +-------------------------+ - * | copied SS | - * | copied Return RSP | - * | copied RFLAGS | - * | copied CS | - * | copied RIP | - * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ - * | pt_regs | - * +-------------------------+ - * - * The saved stack frame is used to fix up the copied stack frame - * that a nested NMI may change to make the interrupted NMI iret jump - * to the repeat_nmi. The original stack frame and the temp storage - * is also used by nested NMIs and can not be trusted on exit. - */ - /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ + /* Restore rdx. */ movq (%rsp), %rdx - /* Set the NMI executing variable on the stack. */ + /* Set "NMI executing" on the stack. */ pushq $1 - /* Leave room for the "copied" frame */ + /* Leave room for the "iret" frame */ subq $(5*8), %rsp - /* Copy the stack frame to the Saved frame */ + /* Copy the "original" frame to the "outermost" frame */ .rept 5 pushq 11*8(%rsp) .endr /* Everything up to here is safe from nested NMIs */ +repeat_nmi: /* * If there was a nested NMI, the first NMI's iret will return * here. But NMIs are still enabled and we can take another @@ -1435,16 +1449,21 @@ first_nmi: * it will just return, as we are about to repeat an NMI anyway. * This makes it safe to copy to the stack frame that a nested * NMI will update. - */ -repeat_nmi: - /* - * Update the stack variable to say we are still in NMI (the update - * is benign for the non-repeat case, where 1 was pushed just above - * to this very stack slot). + * + * RSP is pointing to "outermost RIP". gsbase is unknown, but, if + * we're repeating an NMI, gsbase has the same value that it had on + * the first iteration. paranoid_entry will load the kernel + * gsbase if needed before we call do_nmi. + * + * Set "NMI executing" in case we came back here via IRET. */ movq $1, 10*8(%rsp) - /* Make another copy, this one may be modified by nested NMIs */ + /* + * Copy the "outermost" frame to the "iret" frame. NMIs that nest + * here must not modify the "iret" frame while we're writing to + * it or it will end up containing garbage. + */ addq $(10*8), %rsp .rept 5 pushq -6*8(%rsp) @@ -1453,9 +1472,9 @@ repeat_nmi: end_repeat_nmi: /* - * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception and reset our iret stack - * so that we repeat another NMI. + * Everything below this point can be preempted by a nested NMI. + * If this happens, then the inner NMI will change the "iret" + * frame to point back to repeat_nmi. */ pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK @@ -1481,11 +1500,18 @@ nmi_swapgs: nmi_restore: RESTORE_EXTRA_REGS RESTORE_C_REGS - /* Pop the extra iret frame at once */ + + /* Point RSP at the "iret" frame. */ REMOVE_PT_GPREGS_FROM_STACK 6*8 - /* Clear the NMI executing stack variable */ + /* Clear "NMI executing". */ movq $0, 5*8(%rsp) + + /* + * INTERRUPT_RETURN reads the "iret" frame and exits the NMI + * stack in a single instruction. We are returning to kernel + * mode, so this cannot result in a fault. + */ INTERRUPT_RETURN END(nmi) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d8766b1c9974..d05bd2e2ee91 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_regs *regs) NOKPROBE_SYMBOL(default_do_nmi); /* - * NMIs can hit breakpoints which will cause it to lose its NMI context - * with the CPU when the breakpoint or page fault does an IRET. + * NMIs can page fault or hit breakpoints which will cause it to lose + * its NMI context with the CPU when the breakpoint or page fault does an IRET. * * As a result, NMIs can nest if NMIs get unmasked due an IRET during * NMI processing. On x86_64, the asm glue protects us from nested NMIs -- cgit From a27507ca2d796cfa8d907de31ad730359c8a6d06 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:37 -0700 Subject: x86/nmi/64: Reorder nested NMI checks Check the repeat_nmi .. end_repeat_nmi special case first. The next patch will rework the RSP check and, as a side effect, the RSP check will no longer detect repeat_nmi .. end_repeat_nmi, so we'll need this ordering of the checks. Note: this is more subtle than it appears. The check for repeat_nmi .. end_repeat_nmi jumps straight out of the NMI code instead of adjusting the "iret" frame to force a repeat. This is necessary, because the code between repeat_nmi and end_repeat_nmi sets "NMI executing" and then writes to the "iret" frame itself. If a nested NMI comes in and modifies the "iret" frame while repeat_nmi is also modifying it, we'll end up with garbage. The old code got this right, as does the new code, but the new code is a bit more explicit. If we were to move the check right after the "NMI executing" check, then we'd get it wrong and have random crashes. ( Because the "NMI executing" check would jump to the code that would modify the "iret" frame without checking if the interrupted NMI was currently modifying it. ) Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f54d63a60a3b..5c4ab384b84f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1361,7 +1361,24 @@ ENTRY(nmi) /* * Determine whether we're a nested NMI. * - * First check "NMI executing". If it's set, then we're nested. + * If we interrupted kernel code between repeat_nmi and + * end_repeat_nmi, then we are a nested NMI. We must not + * modify the "iret" frame because it's being written by + * the outer NMI. That's okay; the outer NMI handler is + * about to about to call do_nmi anyway, so we can just + * resume the outer NMI. + */ + + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out +1: + + /* + * Now check "NMI executing". If it's set, then we're nested. * This will not detect if we interrupted an outer NMI just * before IRET. */ @@ -1386,21 +1403,6 @@ ENTRY(nmi) /* Ah, it is within the NMI stack, treat it as nested */ nested_nmi: - /* - * If we interrupted an NMI that is between repeat_nmi and - * end_repeat_nmi, then we must not modify the "iret" frame - * because it's being written by the outer NMI. That's okay; - * the outer NMI handler is about to call do_nmi anyway, - * so we can just resume the outer NMI. - */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out - -1: /* * Modify the "iret" frame to point to repeat_nmi, forcing another * iteration of NMI handling. -- cgit From 810bc075f78ff2c221536eb3008eac6a492dba2d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:38 -0700 Subject: x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI detection We have a tricky bug in the nested NMI code: if we see RSP pointing to the NMI stack on NMI entry from kernel mode, we assume that we are executing a nested NMI. This isn't quite true. A malicious userspace program can point RSP at the NMI stack, issue SYSCALL, and arrange for an NMI to happen while RSP is still pointing at the NMI stack. Fix it with a sneaky trick. Set DF in the region of code that the RSP check is intended to detect. IRET will clear DF atomically. ( Note: other than paravirt, there's little need for all this complexity. We could check RIP instead of RSP. ) Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5c4ab384b84f..d8ab2b201fa1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1388,7 +1388,14 @@ ENTRY(nmi) /* * Now test if the previous stack was an NMI stack. This covers * the case where we interrupt an outer NMI after it clears - * "NMI executing" but before IRET. + * "NMI executing" but before IRET. We need to be careful, though: + * there is one case in which RSP could point to the NMI stack + * despite there being no NMI active: naughty userspace controls + * RSP at the very beginning of the SYSCALL targets. We can + * pull a fast one on naughty userspace, though: we program + * SYSCALL to mask DF, so userspace cannot cause DF to be set + * if it controls the kernel's RSP. We set DF before we clear + * "NMI executing". */ lea 6*8(%rsp), %rdx /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ @@ -1400,7 +1407,13 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ jb first_nmi - /* Ah, it is within the NMI stack, treat it as nested */ + + /* Ah, it is within the NMI stack. */ + + testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) + jz first_nmi /* RSP was user controlled. */ + + /* This is a nested NMI. */ nested_nmi: /* @@ -1506,8 +1519,16 @@ nmi_restore: /* Point RSP at the "iret" frame. */ REMOVE_PT_GPREGS_FROM_STACK 6*8 - /* Clear "NMI executing". */ - movq $0, 5*8(%rsp) + /* + * Clear "NMI executing". Set DF first so that we can easily + * distinguish the remaining code between here and IRET from + * the SYSCALL entry and exit paths. On a native kernel, we + * could just inspect RIP, but, on paravirt kernels, + * INTERRUPT_RETURN can translate into a jump into a + * hypercall page. + */ + std + movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* * INTERRUPT_RETURN reads the "iret" frame and exits the NMI -- cgit From 23a781e987f05029c4a99a5c145be3efa6eda9f3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:39 -0700 Subject: x86/nmi/64: Minor asm simplification Replace LEA; MOV with an equivalent SUB. This saves one instruction. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d8ab2b201fa1..0fb52526e452 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1420,8 +1420,7 @@ nested_nmi: * Modify the "iret" frame to point to repeat_nmi, forcing another * iteration of NMI handling. */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp + subq $8, %rsp leaq -10*8(%rsp), %rdx pushq $__KERNEL_DS pushq %rdx -- cgit From 36f1a77b3aa57c5c2eb1ae2d67d07c4350a78345 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:40 -0700 Subject: x86/nmi/64: Make the "NMI executing" variable more consistent Currently, "NMI executing" is one the first time an outermost NMI hits repeat_nmi and zero thereafter. Change it to be zero each time for consistency. This is intended to help NMI handling fail harder if it's buggy. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0fb52526e452..5422bd20bdf4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1441,8 +1441,8 @@ first_nmi: /* Restore rdx. */ movq (%rsp), %rdx - /* Set "NMI executing" on the stack. */ - pushq $1 + /* Make room for "NMI executing". */ + pushq $0 /* Leave room for the "iret" frame */ subq $(5*8), %rsp @@ -1467,11 +1467,10 @@ repeat_nmi: * RSP is pointing to "outermost RIP". gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. - * - * Set "NMI executing" in case we came back here via IRET. + * gsbase if needed before we call do_nmi. "NMI executing" + * is zero. */ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Set "NMI executing". */ /* * Copy the "outermost" frame to the "iret" frame. NMIs that nest -- cgit From a97439aa1aec10387797b4abae3cf117de1c90d7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:41 -0700 Subject: x86/entry/64, x86/nmi/64: Add CONFIG_DEBUG_ENTRY NMI testing code It turns out to be rather tedious to test the NMI nesting code. Make it easier: add a new CONFIG_DEBUG_ENTRY option that causes the NMI handler to pre-emptively unmask NMIs. With this option set, errors in the repeat_nmi logic or failures to detect that we're in a nested NMI will result in quick panics under perf (especially if multiple counters are running at high frequency) instead of requiring an unusual workload that generates page faults or breakpoints inside NMIs. I called it CONFIG_DEBUG_ENTRY instead of CONFIG_DEBUG_NMI_ENTRY because I want to add new non-NMI checks elsewhere in the entry code in the future, and I'd rather not add too many new config options or add this option and then immediately rename it. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 12 ++++++++++++ arch/x86/entry/entry_64.S | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86/entry') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index a15893d17c55..d8c0d3266173 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -297,6 +297,18 @@ config OPTIMIZE_INLINING If unsure, say N. +config DEBUG_ENTRY + bool "Debug low-level entry code" + depends on DEBUG_KERNEL + ---help--- + This option enables sanity checks in x86's low-level entry code. + Some of these sanity checks may slow down kernel entries and + exits or otherwise impact performance. + + This is currently used to help test NMI code. + + If unsure, say N. + config DEBUG_NMI_SELFTEST bool "NMI Selftest" depends on DEBUG_KERNEL && X86_LOCAL_APIC diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5422bd20bdf4..8cb3e438f21e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1454,6 +1454,21 @@ first_nmi: /* Everything up to here is safe from nested NMIs */ +#ifdef CONFIG_DEBUG_ENTRY + /* + * For ease of testing, unmask NMIs right away. Disabled by + * default because IRET is very expensive. + */ + pushq $0 /* SS */ + pushq %rsp /* RSP (minus 8 because of the previous push) */ + addq $8, (%rsp) /* Fix up RSP */ + pushfq /* RFLAGS */ + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ + INTERRUPT_RETURN /* continues at repeat_nmi below */ +1: +#endif + repeat_nmi: /* * If there was a nested NMI, the first NMI's iret will return -- cgit From c0c3322e98029e752526d906d9e3a680ed213c03 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Jul 2015 14:16:43 +0200 Subject: x86/asm/entry/32: Revert 'Do not use R9 in SYSCALL32' commit This change reverts most of commit 53e9accf0f 'Do not use R9 in SYSCALL32'. I don't yet understand how, but code in that commit sometimes fails to preserve EBP. See https://bugzilla.kernel.org/show_bug.cgi?id=101061 "Problems while executing 32-bit code on AMD64" Reported-and-tested-by: Krzysztof A. Sobiecki Signed-off-by: Denys Vlasenko Cc: Linus Torvalds Cc: Steven Rostedt Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Alexei Starovoitov Cc: Will Drewry Cc: Kees Cook CC: x86@kernel.org Link: http://lkml.kernel.org/r/1437740203-11552-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/entry_64_compat.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86/entry') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bb187a6a877c..5a1844765a7a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -205,7 +205,6 @@ sysexit_from_sys_call: movl RDX(%rsp), %edx /* arg3 */ movl RSI(%rsp), %ecx /* arg4 */ movl RDI(%rsp), %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ .endm .macro auditsys_exit exit @@ -236,6 +235,7 @@ sysexit_from_sys_call: sysenter_auditsys: auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ jmp sysenter_dispatch sysexit_audit: @@ -336,7 +336,7 @@ ENTRY(entry_SYSCALL_compat) * 32-bit zero extended: */ ASM_STAC -1: movl (%r8), %ebp +1: movl (%r8), %r9d _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -346,7 +346,7 @@ ENTRY(entry_SYSCALL_compat) cstar_do_call: /* 32-bit syscall -> 64-bit C ABI argument conversion */ movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ + /* r9 already loaded */ /* arg6 */ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ movl %ebx, %edi /* arg1 */ movl %edx, %edx /* arg3 (zero extension) */ @@ -358,7 +358,6 @@ cstar_dispatch: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: - movl RCX(%rsp), %ebp DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -392,7 +391,9 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: + movl %r9d, R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common + movl R9(%rsp), %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -404,14 +405,16 @@ cstar_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif + xchgl %r9d, %ebp SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) - movq %rax, R9(%rsp) + movq %r9, R9(%rsp) movq %rax, R8(%rsp) movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter + movl R9(%rsp), %r9d /* Reload arg registers from stack. (see sysenter_tracesys) */ movl RCX(%rsp), %ecx @@ -421,6 +424,7 @@ cstar_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS + xchgl %ebp, %r9d jmp cstar_do_call END(entry_SYSCALL_compat) -- cgit