[amd64] Fix nested NMI handling (CVE-2015-3290, CVE-2015-3291, CVE-2015-5157)

svn path=/dists/sid/linux/; revision=22842
This commit is contained in:
Ben Hutchings 2015-07-22 20:46:22 +00:00
parent 2017c10b63
commit 561e869fda
11 changed files with 1022 additions and 0 deletions

10
debian/changelog vendored
View File

@ -1,7 +1,17 @@
linux (4.0.8-2) UNRELEASED; urgency=medium
[ Uwe Kleine-König ]
* [rt] Update to 4.0.8-rt6
[ Ben Hutchings ]
* [amd64] Fix nested NMI handling (CVE-2015-3290, CVE-2015-3291,
CVE-2015-5157)
- Enable nested do_nmi handling for 64-bit kernels
- Remove asm code that saves cr2
- Switch stacks on userspace NMI entry
- Reorder nested NMI checks
- Use DF to avoid userspace RSP confusing nested NMI detection
-- Uwe Kleine-König <uwe@kleine-koenig.org> Tue, 21 Jul 2015 23:19:12 +0200
linux (4.0.8-1) unstable; urgency=medium

View File

@ -0,0 +1,70 @@
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Wed, 1 Apr 2015 16:50:57 +0200
Subject: [1/9] x86/asm/entry/64: Fold the 'test_in_nmi' macro into its
only user
Origin: https://git.kernel.org/linus/0784b36448a2a85b95b6eb21a69b9045c896c065
No code changes.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427899858-7165-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/entry_64.S | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1462,19 +1462,7 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
-/*
- * Test if a given stack is an NMI stack or not.
- */
- .macro test_in_nmi reg stack nmi_ret normal_ret
- cmpq %\reg, \stack
- ja \normal_ret
- subq $EXCEPTION_STKSZ, %\reg
- cmpq %\reg, \stack
- jb \normal_ret
- jmp \nmi_ret
- .endm
-
- /* runs on exception stack */
+/* Runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1535,8 +1523,18 @@ ENTRY(nmi)
* We check the variable because the first NMI could be in a
* breakpoint routine using a breakpoint stack.
*/
- lea 6*8(%rsp), %rdx
- test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+ /* Ah, it is within the NMI stack, treat it as nested */
+ jmp nested_nmi
+
CFI_REMEMBER_STATE
nested_nmi:

View File

@ -0,0 +1,41 @@
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Tue, 7 Apr 2015 22:43:41 +0200
Subject: [2/9] x86/asm/entry/64: Remove a redundant jump
Origin: https://git.kernel.org/linus/a30b0085f54efae11f6256df4e4a16af7eefc1c4
Jumping to the very next instruction is not very useful:
jmp label
label:
Removing the jump.
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-5-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/entry_64.S | 1 -
1 file changed, 1 deletion(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1533,7 +1533,6 @@ ENTRY(nmi)
/* If it is below the NMI stack, it is a normal NMI */
jb first_nmi
/* Ah, it is within the NMI stack, treat it as nested */
- jmp nested_nmi
CFI_REMEMBER_STATE

View File

@ -0,0 +1,47 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 4 Jun 2015 13:24:29 -0700
Subject: [3/9] x86/asm/entry/64: Remove pointless jump to irq_return
Origin: https://git.kernel.org/linus/5ca6f70f387b4f82903037cc3c5488e2c97dcdbc
INTERRUPT_RETURN turns into a jmp instruction. There's no need
for extra indirection.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: <linux-kernel@vger.kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/2f2318653dbad284a59311f13f08cea71298fd7c.1433449436.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0: adjust filename, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/entry_64.S | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -862,8 +862,6 @@ retint_restore_args: /* return to kernel
TRACE_IRQS_IRETQ
restore_args:
RESTORE_ARGS 1,8,1
-
-irq_return:
INTERRUPT_RETURN
ENTRY(native_iret)
@@ -1708,7 +1706,7 @@ nmi_restore:
/* Clear the NMI executing stack variable */
movq $0, 5*8(%rsp)
- jmp irq_return
+ INTERRUPT_RETURN
CFI_ENDPROC
END(nmi)

View File

@ -0,0 +1,191 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:33 -0700
Subject: [4/9] x86/nmi: Enable nested do_nmi() handling for 64-bit kernels
Origin: https://git.kernel.org/linus/9d05041679904b12c12421cbcf9cb5f4860a8d7b
32-bit kernels handle nested NMIs in C. Enable the exact same
handling on 64-bit kernels as well. This isn't currently
necessary, but it will become necessary once the asm code starts
allowing limited nesting.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/nmi.c | 123 +++++++++++++++++++++-----------------------------
1 file changed, 52 insertions(+), 71 deletions(-)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_reg
NOKPROBE_SYMBOL(default_do_nmi);
/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can hit breakpoints which will cause it to lose its NMI context
+ * with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-#define nmi_nesting_preprocess(regs) \
- do { \
- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
- this_cpu_write(nmi_state, NMI_LATCHED); \
- return; \
- } \
- this_cpu_write(nmi_state, NMI_EXECUTING); \
- this_cpu_write(nmi_cr2, read_cr2()); \
- } while (0); \
- nmi_restart:
-
-#define nmi_nesting_postprocess() \
- do { \
- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
- write_cr2(this_cpu_read(nmi_cr2)); \
- if (this_cpu_dec_return(nmi_state)) \
- goto nmi_restart; \
- } while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
- *
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preproces
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-}
#endif
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_nesting_preprocess(regs);
-
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_
nmi_exit();
- /* On i386, may loop back to preprocess */
- nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);

View File

@ -0,0 +1,53 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:34 -0700
Subject: [5/9] x86/nmi/64: Remove asm code that saves CR2
Origin: https://git.kernel.org/linus/0e181bb58143cb4a2e8f01c281b0816cd0e4798e
Now that do_nmi saves CR2, we don't need to save it in asm.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0: adjust filename, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/entry_64.S | 18 ------------------
1 file changed, 18 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1673,29 +1673,11 @@ end_repeat_nmi:
call save_paranoid
DEFAULT_FRAME 0
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
-
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:

View File

@ -0,0 +1,135 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:35 -0700
Subject: [6/9] x86/nmi/64: Switch stacks on userspace NMI entry
Origin: https://git.kernel.org/linus/9b6e6a8334d56354853f9c255d1395c2ba570e0a
Returning to userspace is tricky: IRET can fail, and ESPFIX can
rearrange the stack prior to IRET.
The NMI nesting fixup relies on a precise stack layout and
atomic IRET. Rather than trying to teach the NMI nesting fixup
to handle ESPFIX and failed IRET, punt: run NMIs that came from
user mode on the normal kernel stack.
This will make some nested NMIs visible to C code, but the C
code is okay with that.
As a side effect, this should speed up perf: it eliminates an
RDMSR when NMIs come from user mode.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0:
- Adjust filename, context
- s/restore_c_regs_and_iret/restore_args/
- Use kernel_stack + KERNEL_STACK_OFFSET instead of cpu_current_top_of_stack]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
[luto: Open-coded return path to avoid dependency on partial pt_regs details]
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
arch/x86/kernel/entry_64.S | 79 +++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 75 insertions(+), 4 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1494,19 +1494,90 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
*/
/* Use %rdx as out temp variable throughout */
pushq_cfi %rdx
CFI_REL_OFFSET rdx, 0
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ */
+
+ SWAPGS
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(kernel_stack), %rsp
+ addq $KERNEL_STACK_OFFSET, %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+
/*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Open-code the entire return process for compatibility with varying
+ * register layouts across different kernel versions.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
+ addq $6*8, %rsp /* skip bx, bp, and r12-r15 */
+ popq %r11 /* pt_regs->r11 */
+ popq %r10 /* pt_regs->r10 */
+ popq %r9 /* pt_regs->r9 */
+ popq %r8 /* pt_regs->r8 */
+ popq %rax /* pt_regs->ax */
+ popq %rcx /* pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+ addq $8, %rsp /* skip orig_ax */
+ INTERRUPT_RETURN
+.Lnmi_from_kernel:
/*
* Check the special variable on the stack to see if NMIs are
* executing.

View File

@ -0,0 +1,285 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:36 -0700
Subject: [7/9] x86/nmi/64: Improve nested NMI comments
Origin: https://git.kernel.org/linus/0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b
I found the nested NMI documentation to be difficult to follow.
Improve the comments.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0: adjust filename, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/entry_64.S | 159 ++++++++++++++++++++++++++-------------------
arch/x86/kernel/nmi.c | 4 +-
2 files changed, 93 insertions(+), 70 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1481,11 +1481,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1579,18 +1580,60 @@ ENTRY(nmi)
.Lnmi_from_kernel:
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
+ /*
+ * Determine whether we're a nested NMI.
+ *
+ * First check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi
/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET.
*/
lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1607,9 +1650,11 @@ ENTRY(nmi)
nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * If we interrupted an NMI that is between repeat_nmi and
+ * end_repeat_nmi, then we must not modify the "iret" frame
+ * because it's being written by the outer NMI. That's okay;
+ * the outer NMI handler is about to call do_nmi anyway,
+ * so we can just resume the outer NMI.
*/
movq $repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
@@ -1619,7 +1664,10 @@ nested_nmi:
ja nested_nmi_out
1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
+ */
leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
CFI_ADJUST_CFA_OFFSET 1*8
@@ -1638,60 +1686,23 @@ nested_nmi_out:
popq_cfi %rdx
CFI_RESTORE rdx
- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
CFI_RESTORE_STATE
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
CFI_RESTORE rdx
- /* Set the NMI executing variable on the stack. */
+ /* Set "NMI executing" on the stack. */
pushq_cfi $1
- /*
- * Leave room for the "copied" frame
- */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
CFI_ADJUST_CFA_OFFSET 5*8
- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq_cfi 11*8(%rsp)
.endr
@@ -1699,6 +1710,7 @@ first_nmi:
/* Everything up to here is safe from nested NMIs */
+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1707,16 +1719,21 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
- */
-repeat_nmi:
- /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi.
+ *
+ * Set "NMI executing" in case we came back here via IRET.
*/
movq $1, 10*8(%rsp)
- /* Make another copy, this one may be modified by nested NMIs */
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
addq $(10*8), %rsp
CFI_ADJUST_CFA_OFFSET -10*8
.rept 5
@@ -1727,9 +1744,9 @@ repeat_nmi:
end_repeat_nmi:
/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
@@ -1754,11 +1771,17 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- /* Pop the extra iret frame at once */
+
RESTORE_ALL 6*8
- /* Clear the NMI executing stack variable */
+ /* Clear "NMI executing". */
movq $0, 5*8(%rsp)
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
INTERRUPT_RETURN
CFI_ENDPROC
END(nmi)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_reg
NOKPROBE_SYMBOL(default_do_nmi);
/*
- * NMIs can hit breakpoints which will cause it to lose its NMI context
- * with the CPU when the breakpoint or page fault does an IRET.
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
*
* As a result, NMIs can nest if NMIs get unmasked due an IRET during
* NMI processing. On x86_64, the asm glue protects us from nested NMIs

View File

@ -0,0 +1,91 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:37 -0700
Subject: [8/9] x86/nmi/64: Reorder nested NMI checks
Origin: https://git.kernel.org/linus/a27507ca2d796cfa8d907de31ad730359c8a6d06
Check the repeat_nmi .. end_repeat_nmi special case first. The
next patch will rework the RSP check and, as a side effect, the
RSP check will no longer detect repeat_nmi .. end_repeat_nmi, so
we'll need this ordering of the checks.
Note: this is more subtle than it appears. The check for
repeat_nmi .. end_repeat_nmi jumps straight out of the NMI code
instead of adjusting the "iret" frame to force a repeat. This
is necessary, because the code between repeat_nmi and
end_repeat_nmi sets "NMI executing" and then writes to the
"iret" frame itself. If a nested NMI comes in and modifies the
"iret" frame while repeat_nmi is also modifying it, we'll end up
with garbage. The old code got this right, as does the new
code, but the new code is a bit more explicit.
If we were to move the check right after the "NMI executing"
check, then we'd get it wrong and have random crashes.
( Because the "NMI executing" check would jump to the code that would
modify the "iret" frame without checking if the interrupted NMI was
currently modifying it. )
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0: adjust filename, spacing]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/entry_64.S | 34 ++++++++++++++++++----------------
1 file changed, 18 insertions(+), 16 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1623,7 +1623,24 @@ ENTRY(nmi)
/*
* Determine whether we're a nested NMI.
*
- * First check "NMI executing". If it's set, then we're nested.
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
+ */
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
+
+ /*
+ * Now check "NMI executing". If it's set, then we're nested.
* This will not detect if we interrupted an outer NMI just
* before IRET.
*/
@@ -1650,21 +1667,6 @@ ENTRY(nmi)
nested_nmi:
/*
- * If we interrupted an NMI that is between repeat_nmi and
- * end_repeat_nmi, then we must not modify the "iret" frame
- * because it's being written by the outer NMI. That's okay;
- * the outer NMI handler is about to call do_nmi anyway,
- * so we can just resume the outer NMI.
- */
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /*
* Modify the "iret" frame to point to repeat_nmi, forcing another
* iteration of NMI handling.
*/

View File

@ -0,0 +1,90 @@
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 10:29:38 -0700
Subject: x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI
detection
Origin: https://git.kernel.org/linus/810bc075f78ff2c221536eb3008eac6a492dba2d
We have a tricky bug in the nested NMI code: if we see RSP
pointing to the NMI stack on NMI entry from kernel mode, we
assume that we are executing a nested NMI.
This isn't quite true. A malicious userspace program can point
RSP at the NMI stack, issue SYSCALL, and arrange for an NMI to
happen while RSP is still pointing at the NMI stack.
Fix it with a sneaky trick. Set DF in the region of code that
the RSP check is intended to detect. IRET will clear DF
atomically.
( Note: other than paravirt, there's little need for all this
complexity. We could check RIP instead of RSP. )
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0: adjust filename, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
arch/x86/kernel/entry_64.S | 29 +++++++++++++++++++++++++----
1 file changed, 25 insertions(+), 4 deletions(-)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1650,7 +1650,14 @@ ENTRY(nmi)
/*
* Now test if the previous stack was an NMI stack. This covers
* the case where we interrupt an outer NMI after it clears
- * "NMI executing" but before IRET.
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
*/
lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1661,10 +1668,16 @@ ENTRY(nmi)
cmpq %rdx, 4*8(%rsp)
/* If it is below the NMI stack, it is a normal NMI */
jb first_nmi
- /* Ah, it is within the NMI stack, treat it as nested */
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
CFI_REMEMBER_STATE
+ /* This is a nested NMI. */
+
nested_nmi:
/*
* Modify the "iret" frame to point to repeat_nmi, forcing another
@@ -1776,8 +1789,16 @@ nmi_restore:
RESTORE_ALL 6*8
- /* Clear "NMI executing". */
- movq $0, 5*8(%rsp)
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
* INTERRUPT_RETURN reads the "iret" frame and exits the NMI

View File

@ -82,3 +82,12 @@ debian/udp-fix-abi-change-in-4.0.6.patch
bugfix/mips/mips-normalise-code-flow-in-the-cpu-exception-handle.patch
bugfix/mips/mips-correct-fp-isa-requirements.patch
bugfix/x86/kvm-x86-fix-kvm_apic_has_events-to-check-for-null-po.patch
bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch
bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch
bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch
bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch
bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch
bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch
bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch
bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch
bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch