diff --git a/debian/changelog b/debian/changelog index f2612c41e..84e378d5c 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,7 +1,17 @@ linux (4.0.8-2) UNRELEASED; urgency=medium + [ Uwe Kleine-König ] * [rt] Update to 4.0.8-rt6 + [ Ben Hutchings ] + * [amd64] Fix nested NMI handling (CVE-2015-3290, CVE-2015-3291, + CVE-2015-5157) + - Enable nested do_nmi handling for 64-bit kernels + - Remove asm code that saves cr2 + - Switch stacks on userspace NMI entry + - Reorder nested NMI checks + - Use DF to avoid userspace RSP confusing nested NMI detection + -- Uwe Kleine-König Tue, 21 Jul 2015 23:19:12 +0200 linux (4.0.8-1) unstable; urgency=medium diff --git a/debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch b/debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch new file mode 100644 index 000000000..2744ca48c --- /dev/null +++ b/debian/patches/bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch @@ -0,0 +1,70 @@ +From: Denys Vlasenko +Date: Wed, 1 Apr 2015 16:50:57 +0200 +Subject: [1/9] x86/asm/entry/64: Fold the 'test_in_nmi' macro into its + only user +Origin: https://git.kernel.org/linus/0784b36448a2a85b95b6eb21a69b9045c896c065 + +No code changes. + +Signed-off-by: Denys Vlasenko +Acked-by: Borislav Petkov +Cc: Alexei Starovoitov +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Frederic Weisbecker +Cc: H. Peter Anvin +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Oleg Nesterov +Cc: Steven Rostedt +Cc: Will Drewry +Link: http://lkml.kernel.org/r/1427899858-7165-1-git-send-email-dvlasenk@redhat.com +Signed-off-by: Ingo Molnar +--- + arch/x86/kernel/entry_64.S | 28 +++++++++++++--------------- + 1 file changed, 13 insertions(+), 15 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1462,19 +1462,7 @@ ENTRY(error_exit) + CFI_ENDPROC + END(error_exit) + +-/* +- * Test if a given stack is an NMI stack or not. 
+- */ +- .macro test_in_nmi reg stack nmi_ret normal_ret +- cmpq %\reg, \stack +- ja \normal_ret +- subq $EXCEPTION_STKSZ, %\reg +- cmpq %\reg, \stack +- jb \normal_ret +- jmp \nmi_ret +- .endm +- +- /* runs on exception stack */ ++/* Runs on exception stack */ + ENTRY(nmi) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME +@@ -1535,8 +1523,18 @@ ENTRY(nmi) + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ +- lea 6*8(%rsp), %rdx +- test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi ++ lea 6*8(%rsp), %rdx ++ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ ++ cmpq %rdx, 4*8(%rsp) ++ /* If the stack pointer is above the NMI stack, this is a normal NMI */ ++ ja first_nmi ++ subq $EXCEPTION_STKSZ, %rdx ++ cmpq %rdx, 4*8(%rsp) ++ /* If it is below the NMI stack, it is a normal NMI */ ++ jb first_nmi ++ /* Ah, it is within the NMI stack, treat it as nested */ ++ jmp nested_nmi ++ + CFI_REMEMBER_STATE + + nested_nmi: diff --git a/debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch b/debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch new file mode 100644 index 000000000..b04ce3a7b --- /dev/null +++ b/debian/patches/bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch @@ -0,0 +1,41 @@ +From: Denys Vlasenko +Date: Tue, 7 Apr 2015 22:43:41 +0200 +Subject: [2/9] x86/asm/entry/64: Remove a redundant jump +Origin: https://git.kernel.org/linus/a30b0085f54efae11f6256df4e4a16af7eefc1c4 + +Jumping to the very next instruction is not very useful: + + jmp label + label: + +Removing the jump. + +Signed-off-by: Denys Vlasenko +Cc: Alexei Starovoitov +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Frederic Weisbecker +Cc: H. 
Peter Anvin +Cc: Kees Cook +Cc: Linus Torvalds +Cc: Oleg Nesterov +Cc: Steven Rostedt +Cc: Thomas Gleixner +Cc: Will Drewry +Link: http://lkml.kernel.org/r/1428439424-7258-5-git-send-email-dvlasenk@redhat.com +Signed-off-by: Ingo Molnar +--- + arch/x86/kernel/entry_64.S | 1 - + 1 file changed, 1 deletion(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1533,7 +1533,6 @@ ENTRY(nmi) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + /* Ah, it is within the NMI stack, treat it as nested */ +- jmp nested_nmi + + CFI_REMEMBER_STATE + diff --git a/debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch b/debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch new file mode 100644 index 000000000..af71d5282 --- /dev/null +++ b/debian/patches/bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch @@ -0,0 +1,47 @@ +From: Andy Lutomirski +Date: Thu, 4 Jun 2015 13:24:29 -0700 +Subject: [3/9] x86/asm/entry/64: Remove pointless jump to irq_return +Origin: https://git.kernel.org/linus/5ca6f70f387b4f82903037cc3c5488e2c97dcdbc + +INTERRUPT_RETURN turns into a jmp instruction. There's no need +for extra indirection. + +Signed-off-by: Andy Lutomirski +Cc: +Cc: Andrew Morton +Cc: Andy Lutomirski +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. 
Peter Anvin +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/2f2318653dbad284a59311f13f08cea71298fd7c.1433449436.git.luto@kernel.org +Signed-off-by: Ingo Molnar +[bwh: Backported to 4.0: adjust filename, context] +Signed-off-by: Ben Hutchings +--- + arch/x86/kernel/entry_64.S | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -862,8 +862,6 @@ retint_restore_args: /* return to kernel + TRACE_IRQS_IRETQ + restore_args: + RESTORE_ARGS 1,8,1 +- +-irq_return: + INTERRUPT_RETURN + + ENTRY(native_iret) +@@ -1708,7 +1706,7 @@ nmi_restore: + + /* Clear the NMI executing stack variable */ + movq $0, 5*8(%rsp) +- jmp irq_return ++ INTERRUPT_RETURN + CFI_ENDPROC + END(nmi) + diff --git a/debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch b/debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch new file mode 100644 index 000000000..cb18ec247 --- /dev/null +++ b/debian/patches/bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch @@ -0,0 +1,191 @@ +From: Andy Lutomirski +Date: Wed, 15 Jul 2015 10:29:33 -0700 +Subject: [4/9] x86/nmi: Enable nested do_nmi() handling for 64-bit kernels +Origin: https://git.kernel.org/linus/9d05041679904b12c12421cbcf9cb5f4860a8d7b + +32-bit kernels handle nested NMIs in C. Enable the exact same +handling on 64-bit kernels as well. This isn't currently +necessary, but it will become necessary once the asm code starts +allowing limited nesting. 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Steven Rostedt +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Ingo Molnar +--- + arch/x86/kernel/nmi.c | 123 +++++++++++++++++++++----------------------------- + 1 file changed, 52 insertions(+), 71 deletions(-) + +--- a/arch/x86/kernel/nmi.c ++++ b/arch/x86/kernel/nmi.c +@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_reg + NOKPROBE_SYMBOL(default_do_nmi); + + /* +- * NMIs can hit breakpoints which will cause it to lose its +- * NMI context with the CPU when the breakpoint does an iret. +- */ +-#ifdef CONFIG_X86_32 +-/* +- * For i386, NMIs use the same stack as the kernel, and we can +- * add a workaround to the iret problem in C (preventing nested +- * NMIs if an NMI takes a trap). Simply have 3 states the NMI +- * can be in: ++ * NMIs can hit breakpoints which will cause it to lose its NMI context ++ * with the CPU when the breakpoint or page fault does an IRET. ++ * ++ * As a result, NMIs can nest if NMIs get unmasked due an IRET during ++ * NMI processing. On x86_64, the asm glue protects us from nested NMIs ++ * if the outer NMI came from kernel mode, but we can still nest if the ++ * outer NMI came from user mode. ++ * ++ * To handle these nested NMIs, we have three states: + * + * 1) not running + * 2) executing +@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi); + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * +- * If an NMI hits a breakpoint that executes an iret, another +- * NMI can preempt it. We do not want to allow this new NMI +- * to run, but we want to execute it when the first one finishes. +- * We set the state to "latched", and the exit of the first NMI will +- * perform a dec_return, if the result is zero (NOT_RUNNING), then +- * it will simply exit the NMI handler. 
If not, the dec_return +- * would have set the state to NMI_EXECUTING (what we want it to +- * be when we are running). In this case, we simply jump back +- * to rerun the NMI handler again, and restart the 'latched' NMI. ++ * If an NMI executes an iret, another NMI can preempt it. We do not ++ * want to allow this new NMI to run, but we want to execute it when the ++ * first one finishes. We set the state to "latched", and the exit of ++ * the first NMI will perform a dec_return, if the result is zero ++ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the ++ * dec_return would have set the state to NMI_EXECUTING (what we want it ++ * to be when we are running). In this case, we simply jump back to ++ * rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING +@@ -461,49 +460,36 @@ enum nmi_states { + static DEFINE_PER_CPU(enum nmi_states, nmi_state); + static DEFINE_PER_CPU(unsigned long, nmi_cr2); + +-#define nmi_nesting_preprocess(regs) \ +- do { \ +- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ +- this_cpu_write(nmi_state, NMI_LATCHED); \ +- return; \ +- } \ +- this_cpu_write(nmi_state, NMI_EXECUTING); \ +- this_cpu_write(nmi_cr2, read_cr2()); \ +- } while (0); \ +- nmi_restart: +- +-#define nmi_nesting_postprocess() \ +- do { \ +- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ +- write_cr2(this_cpu_read(nmi_cr2)); \ +- if (this_cpu_dec_return(nmi_state)) \ +- goto nmi_restart; \ +- } while (0) +-#else /* x86_64 */ ++#ifdef CONFIG_X86_64 + /* +- * In x86_64 things are a bit more difficult. This has the same problem +- * where an NMI hitting a breakpoint that calls iret will remove the +- * NMI context, allowing a nested NMI to enter. What makes this more +- * difficult is that both NMIs and breakpoints have their own stack. 
+- * When a new NMI or breakpoint is executed, the stack is set to a fixed +- * point. If an NMI is nested, it will have its stack set at that same +- * fixed address that the first NMI had, and will start corrupting the +- * stack. This is handled in entry_64.S, but the same problem exists with +- * the breakpoint stack. +- * +- * If a breakpoint is being processed, and the debug stack is being used, +- * if an NMI comes in and also hits a breakpoint, the stack pointer +- * will be set to the same fixed address as the breakpoint that was +- * interrupted, causing that stack to be corrupted. To handle this case, +- * check if the stack that was interrupted is the debug stack, and if +- * so, change the IDT so that new breakpoints will use the current stack +- * and not switch to the fixed address. On return of the NMI, switch back +- * to the original IDT. ++ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without ++ * some care, the inner breakpoint will clobber the outer breakpoint's ++ * stack. ++ * ++ * If a breakpoint is being processed, and the debug stack is being ++ * used, if an NMI comes in and also hits a breakpoint, the stack ++ * pointer will be set to the same fixed address as the breakpoint that ++ * was interrupted, causing that stack to be corrupted. To handle this ++ * case, check if the stack that was interrupted is the debug stack, and ++ * if so, change the IDT so that new breakpoints will use the current ++ * stack and not switch to the fixed address. On return of the NMI, ++ * switch back to the original IDT. 
+ */ + static DEFINE_PER_CPU(int, update_debug_stack); ++#endif + +-static inline void nmi_nesting_preprocess(struct pt_regs *regs) ++dotraplinkage notrace void ++do_nmi(struct pt_regs *regs, long error_code) + { ++ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { ++ this_cpu_write(nmi_state, NMI_LATCHED); ++ return; ++ } ++ this_cpu_write(nmi_state, NMI_EXECUTING); ++ this_cpu_write(nmi_cr2, read_cr2()); ++nmi_restart: ++ ++#ifdef CONFIG_X86_64 + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to +@@ -514,22 +500,8 @@ static inline void nmi_nesting_preproces + debug_stack_set_zero(); + this_cpu_write(update_debug_stack, 1); + } +-} +- +-static inline void nmi_nesting_postprocess(void) +-{ +- if (unlikely(this_cpu_read(update_debug_stack))) { +- debug_stack_reset(); +- this_cpu_write(update_debug_stack, 0); +- } +-} + #endif + +-dotraplinkage notrace void +-do_nmi(struct pt_regs *regs, long error_code) +-{ +- nmi_nesting_preprocess(regs); +- + nmi_enter(); + + inc_irq_stat(__nmi_count); +@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_ + + nmi_exit(); + +- /* On i386, may loop back to preprocess */ +- nmi_nesting_postprocess(); ++#ifdef CONFIG_X86_64 ++ if (unlikely(this_cpu_read(update_debug_stack))) { ++ debug_stack_reset(); ++ this_cpu_write(update_debug_stack, 0); ++ } ++#endif ++ ++ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) ++ write_cr2(this_cpu_read(nmi_cr2)); ++ if (this_cpu_dec_return(nmi_state)) ++ goto nmi_restart; + } + NOKPROBE_SYMBOL(do_nmi); + diff --git a/debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch b/debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch new file mode 100644 index 000000000..33fdec1ef --- /dev/null +++ b/debian/patches/bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch @@ -0,0 +1,53 @@ +From: Andy Lutomirski +Date: Wed, 15 Jul 2015 10:29:34 -0700 +Subject: [5/9] x86/nmi/64: 
Remove asm code that saves CR2 +Origin: https://git.kernel.org/linus/0e181bb58143cb4a2e8f01c281b0816cd0e4798e + +Now that do_nmi saves CR2, we don't need to save it in asm. + +Signed-off-by: Andy Lutomirski +Reviewed-by: Steven Rostedt +Acked-by: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Ingo Molnar +[bwh: Backported to 4.0: adjust filename, context] +Signed-off-by: Ben Hutchings +--- + arch/x86/kernel/entry_64.S | 18 ------------------ + 1 file changed, 18 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1673,29 +1673,11 @@ end_repeat_nmi: + call save_paranoid + DEFAULT_FRAME 0 + +- /* +- * Save off the CR2 register. If we take a page fault in the NMI then +- * it could corrupt the CR2 value. If the NMI preempts a page fault +- * handler before it was able to read the CR2 register, and then the +- * NMI itself takes a page fault, the page fault that was preempted +- * will read the information from the NMI page fault and not the +- * origin fault. Save it off and restore it if it changes. +- * Use the r12 callee-saved register. +- */ +- movq %cr2, %r12 +- + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi + +- /* Did the NMI take a page fault? Restore cr2 if it did */ +- movq %cr2, %rcx +- cmpq %rcx, %r12 +- je 1f +- movq %r12, %cr2 +-1: +- + testl %ebx,%ebx /* swapgs needed? 
*/ + jnz nmi_restore + nmi_swapgs: diff --git a/debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch b/debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch new file mode 100644 index 000000000..168bf3d3c --- /dev/null +++ b/debian/patches/bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch @@ -0,0 +1,135 @@ +From: Andy Lutomirski +Date: Wed, 15 Jul 2015 10:29:35 -0700 +Subject: [6/9] x86/nmi/64: Switch stacks on userspace NMI entry +Origin: https://git.kernel.org/linus/9b6e6a8334d56354853f9c255d1395c2ba570e0a + +Returning to userspace is tricky: IRET can fail, and ESPFIX can +rearrange the stack prior to IRET. + +The NMI nesting fixup relies on a precise stack layout and +atomic IRET. Rather than trying to teach the NMI nesting fixup +to handle ESPFIX and failed IRET, punt: run NMIs that came from +user mode on the normal kernel stack. + +This will make some nested NMIs visible to C code, but the C +code is okay with that. + +As a side effect, this should speed up perf: it eliminates an +RDMSR when NMIs come from user mode. 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Steven Rostedt +Reviewed-by: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Ingo Molnar +[bwh: Backported to 4.0: + - Adjust filename, context + - s/restore_c_regs_and_iret/restore_args/ + - Use kernel_stack + KERNEL_STACK_OFFSET instead of cpu_current_top_of_stack] +Signed-off-by: Ben Hutchings +[luto: Open-coded return path to avoid dependency on partial pt_regs details] +Signed-off-by: Andy Lutomirski +--- + arch/x86/kernel/entry_64.S | 79 +++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 75 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1494,19 +1494,90 @@ ENTRY(nmi) + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. ++ * ++ * However, espfix prevents us from directly returning to userspace ++ * with a single IRET instruction. Similarly, IRET to user mode ++ * can fault. We therefore handle NMIs from user space like ++ * other IST entries. + */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 + ++ testb $3, CS-RIP+8(%rsp) ++ jz .Lnmi_from_kernel ++ ++ /* ++ * NMI from user mode. We need to run on the thread stack, but we ++ * can't go through the normal entry paths: NMIs are masked, and ++ * we don't want to enable interrupts, because then we'll end ++ * up in an awkward situation in which IRQs are on but NMIs ++ * are off. 
++ */ ++ ++ SWAPGS ++ cld ++ movq %rsp, %rdx ++ movq PER_CPU_VAR(kernel_stack), %rsp ++ addq $KERNEL_STACK_OFFSET, %rsp ++ pushq 5*8(%rdx) /* pt_regs->ss */ ++ pushq 4*8(%rdx) /* pt_regs->rsp */ ++ pushq 3*8(%rdx) /* pt_regs->flags */ ++ pushq 2*8(%rdx) /* pt_regs->cs */ ++ pushq 1*8(%rdx) /* pt_regs->rip */ ++ pushq $-1 /* pt_regs->orig_ax */ ++ pushq %rdi /* pt_regs->di */ ++ pushq %rsi /* pt_regs->si */ ++ pushq (%rdx) /* pt_regs->dx */ ++ pushq %rcx /* pt_regs->cx */ ++ pushq %rax /* pt_regs->ax */ ++ pushq %r8 /* pt_regs->r8 */ ++ pushq %r9 /* pt_regs->r9 */ ++ pushq %r10 /* pt_regs->r10 */ ++ pushq %r11 /* pt_regs->r11 */ ++ pushq %rbx /* pt_regs->rbx */ ++ pushq %rbp /* pt_regs->rbp */ ++ pushq %r12 /* pt_regs->r12 */ ++ pushq %r13 /* pt_regs->r13 */ ++ pushq %r14 /* pt_regs->r14 */ ++ pushq %r15 /* pt_regs->r15 */ ++ ++ /* ++ * At this point we no longer need to worry about stack damage ++ * due to nesting -- we're on the normal thread stack and we're ++ * done with the NMI stack. ++ */ ++ ++ movq %rsp, %rdi ++ movq $-1, %rsi ++ call do_nmi ++ ++ /* ++ * Return back to user mode. We must *not* do the normal exit ++ * work, because we don't want to enable interrupts. Fortunately, ++ * do_nmi doesn't modify pt_regs. ++ */ ++ SWAPGS ++ + /* +- * If %cs was not the kernel segment, then the NMI triggered in user +- * space, which means it is definitely not nested. ++ * Open-code the entire return process for compatibility with varying ++ * register layouts across different kernel versions. 
+ */ +- cmpl $__KERNEL_CS, 16(%rsp) +- jne first_nmi ++ addq $6*8, %rsp /* skip bx, bp, and r12-r15 */ ++ popq %r11 /* pt_regs->r11 */ ++ popq %r10 /* pt_regs->r10 */ ++ popq %r9 /* pt_regs->r9 */ ++ popq %r8 /* pt_regs->r8 */ ++ popq %rax /* pt_regs->ax */ ++ popq %rcx /* pt_regs->cx */ ++ popq %rdx /* pt_regs->dx */ ++ popq %rsi /* pt_regs->si */ ++ popq %rdi /* pt_regs->di */ ++ addq $8, %rsp /* skip orig_ax */ ++ INTERRUPT_RETURN + ++.Lnmi_from_kernel: + /* + * Check the special variable on the stack to see if NMIs are + * executing. diff --git a/debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch b/debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch new file mode 100644 index 000000000..a5e18861e --- /dev/null +++ b/debian/patches/bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch @@ -0,0 +1,285 @@ +From: Andy Lutomirski +Date: Wed, 15 Jul 2015 10:29:36 -0700 +Subject: [7/9] x86/nmi/64: Improve nested NMI comments +Origin: https://git.kernel.org/linus/0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b + +I found the nested NMI documentation to be difficult to follow. +Improve the comments. 
+ +Signed-off-by: Andy Lutomirski +Reviewed-by: Steven Rostedt +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Ingo Molnar +[bwh: Backported to 4.0: adjust filename, context] +Signed-off-by: Ben Hutchings +--- + arch/x86/kernel/entry_64.S | 159 ++++++++++++++++++++++++++------------------- + arch/x86/kernel/nmi.c | 4 +- + 2 files changed, 93 insertions(+), 70 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1481,11 +1481,12 @@ ENTRY(nmi) + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack +- * o Copy the interrupt frame into a "saved" location on the stack +- * o Copy the interrupt frame into a "copy" location on the stack ++ * o Copy the interrupt frame into an "outermost" location on the ++ * stack ++ * o Copy the interrupt frame into an "iret" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: +- * o Modify the "copy" location to jump to the repeate_nmi ++ * o Modify the "iret" location to jump to the repeat_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable +@@ -1579,18 +1580,60 @@ ENTRY(nmi) + + .Lnmi_from_kernel: + /* +- * Check the special variable on the stack to see if NMIs are +- * executing. 
++ * Here's what our stack frame will look like: ++ * +---------------------------------------------------------+ ++ * | original SS | ++ * | original Return RSP | ++ * | original RFLAGS | ++ * | original CS | ++ * | original RIP | ++ * +---------------------------------------------------------+ ++ * | temp storage for rdx | ++ * +---------------------------------------------------------+ ++ * | "NMI executing" variable | ++ * +---------------------------------------------------------+ ++ * | iret SS } Copied from "outermost" frame | ++ * | iret Return RSP } on each loop iteration; overwritten | ++ * | iret RFLAGS } by a nested NMI to force another | ++ * | iret CS } iteration if needed. | ++ * | iret RIP } | ++ * +---------------------------------------------------------+ ++ * | outermost SS } initialized in first_nmi; | ++ * | outermost Return RSP } will not be changed before | ++ * | outermost RFLAGS } NMI processing is done. | ++ * | outermost CS } Copied to "iret" frame on each | ++ * | outermost RIP } iteration. | ++ * +---------------------------------------------------------+ ++ * | pt_regs | ++ * +---------------------------------------------------------+ ++ * ++ * The "original" frame is used by hardware. Before re-enabling ++ * NMIs, we need to be done with it, and we need to leave enough ++ * space for the asm code here. ++ * ++ * We return by executing IRET while RSP points to the "iret" frame. ++ * That will either return for real or it will loop back into NMI ++ * processing. ++ * ++ * The "outermost" frame is copied to the "iret" frame on each ++ * iteration of the loop, so each iteration starts with the "iret" ++ * frame pointing to the final return target. ++ */ ++ ++ /* ++ * Determine whether we're a nested NMI. ++ * ++ * First check "NMI executing". If it's set, then we're nested. ++ * This will not detect if we interrupted an outer NMI just ++ * before IRET. 
+ */ + cmpl $1, -8(%rsp) + je nested_nmi + + /* +- * Now test if the previous stack was an NMI stack. +- * We need the double check. We check the NMI stack to satisfy the +- * race when the first NMI clears the variable before returning. +- * We check the variable because the first NMI could be in a +- * breakpoint routine using a breakpoint stack. ++ * Now test if the previous stack was an NMI stack. This covers ++ * the case where we interrupt an outer NMI after it clears ++ * "NMI executing" but before IRET. + */ + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ +@@ -1607,9 +1650,11 @@ ENTRY(nmi) + + nested_nmi: + /* +- * Do nothing if we interrupted the fixup in repeat_nmi. +- * It's about to repeat the NMI handler, so we are fine +- * with ignoring this one. ++ * If we interrupted an NMI that is between repeat_nmi and ++ * end_repeat_nmi, then we must not modify the "iret" frame ++ * because it's being written by the outer NMI. That's okay; ++ * the outer NMI handler is about to call do_nmi anyway, ++ * so we can just resume the outer NMI. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx +@@ -1619,7 +1664,10 @@ nested_nmi: + ja nested_nmi_out + + 1: +- /* Set up the interrupted NMIs stack to jump to repeat_nmi */ ++ /* ++ * Modify the "iret" frame to point to repeat_nmi, forcing another ++ * iteration of NMI handling. ++ */ + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 1*8 +@@ -1638,60 +1686,23 @@ nested_nmi_out: + popq_cfi %rdx + CFI_RESTORE rdx + +- /* No need to check faults here */ ++ /* We are returning to kernel mode, so this cannot result in a fault. */ + INTERRUPT_RETURN + + CFI_RESTORE_STATE + first_nmi: +- /* +- * Because nested NMIs will use the pushed location that we +- * stored in rdx, we must keep that space available. 
+- * Here's what our stack frame will look like: +- * +-------------------------+ +- * | original SS | +- * | original Return RSP | +- * | original RFLAGS | +- * | original CS | +- * | original RIP | +- * +-------------------------+ +- * | temp storage for rdx | +- * +-------------------------+ +- * | NMI executing variable | +- * +-------------------------+ +- * | copied SS | +- * | copied Return RSP | +- * | copied RFLAGS | +- * | copied CS | +- * | copied RIP | +- * +-------------------------+ +- * | Saved SS | +- * | Saved Return RSP | +- * | Saved RFLAGS | +- * | Saved CS | +- * | Saved RIP | +- * +-------------------------+ +- * | pt_regs | +- * +-------------------------+ +- * +- * The saved stack frame is used to fix up the copied stack frame +- * that a nested NMI may change to make the interrupted NMI iret jump +- * to the repeat_nmi. The original stack frame and the temp storage +- * is also used by nested NMIs and can not be trusted on exit. +- */ +- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ ++ /* Restore rdx. */ + movq (%rsp), %rdx + CFI_RESTORE rdx + +- /* Set the NMI executing variable on the stack. */ ++ /* Set "NMI executing" on the stack. */ + pushq_cfi $1 + +- /* +- * Leave room for the "copied" frame +- */ ++ /* Leave room for the "iret" frame */ + subq $(5*8), %rsp + CFI_ADJUST_CFA_OFFSET 5*8 + +- /* Copy the stack frame to the Saved frame */ ++ /* Copy the "original" frame to the "outermost" frame */ + .rept 5 + pushq_cfi 11*8(%rsp) + .endr +@@ -1699,6 +1710,7 @@ first_nmi: + + /* Everything up to here is safe from nested NMIs */ + ++repeat_nmi: + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another +@@ -1707,16 +1719,21 @@ first_nmi: + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. 
+- */ +-repeat_nmi: +- /* +- * Update the stack variable to say we are still in NMI (the update +- * is benign for the non-repeat case, where 1 was pushed just above +- * to this very stack slot). ++ * ++ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if ++ * we're repeating an NMI, gsbase has the same value that it had on ++ * the first iteration. paranoid_entry will load the kernel ++ * gsbase if needed before we call do_nmi. ++ * ++ * Set "NMI executing" in case we came back here via IRET. + */ + movq $1, 10*8(%rsp) + +- /* Make another copy, this one may be modified by nested NMIs */ ++ /* ++ * Copy the "outermost" frame to the "iret" frame. NMIs that nest ++ * here must not modify the "iret" frame while we're writing to ++ * it or it will end up containing garbage. ++ */ + addq $(10*8), %rsp + CFI_ADJUST_CFA_OFFSET -10*8 + .rept 5 +@@ -1727,9 +1744,9 @@ repeat_nmi: + end_repeat_nmi: + + /* +- * Everything below this point can be preempted by a nested +- * NMI if the first NMI took an exception and reset our iret stack +- * so that we repeat another NMI. ++ * Everything below this point can be preempted by a nested NMI. ++ * If this happens, then the inner NMI will change the "iret" ++ * frame to point back to repeat_nmi. + */ + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp +@@ -1754,11 +1771,17 @@ end_repeat_nmi: + nmi_swapgs: + SWAPGS_UNSAFE_STACK + nmi_restore: +- /* Pop the extra iret frame at once */ ++ + RESTORE_ALL 6*8 + +- /* Clear the NMI executing stack variable */ ++ /* Clear "NMI executing". */ + movq $0, 5*8(%rsp) ++ ++ /* ++ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI ++ * stack in a single instruction. We are returning to kernel ++ * mode, so this cannot result in a fault. 
++ */ + INTERRUPT_RETURN + CFI_ENDPROC + END(nmi) +--- a/arch/x86/kernel/nmi.c ++++ b/arch/x86/kernel/nmi.c +@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_reg + NOKPROBE_SYMBOL(default_do_nmi); + + /* +- * NMIs can hit breakpoints which will cause it to lose its NMI context +- * with the CPU when the breakpoint or page fault does an IRET. ++ * NMIs can page fault or hit breakpoints which will cause it to lose ++ * its NMI context with the CPU when the breakpoint or page fault does an IRET. + * + * As a result, NMIs can nest if NMIs get unmasked due an IRET during + * NMI processing. On x86_64, the asm glue protects us from nested NMIs diff --git a/debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch b/debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch new file mode 100644 index 000000000..68357e1c1 --- /dev/null +++ b/debian/patches/bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch @@ -0,0 +1,91 @@ +From: Andy Lutomirski +Date: Wed, 15 Jul 2015 10:29:37 -0700 +Subject: [8/9] x86/nmi/64: Reorder nested NMI checks +Origin: https://git.kernel.org/linus/a27507ca2d796cfa8d907de31ad730359c8a6d06 + +Check the repeat_nmi .. end_repeat_nmi special case first. The +next patch will rework the RSP check and, as a side effect, the +RSP check will no longer detect repeat_nmi .. end_repeat_nmi, so +we'll need this ordering of the checks. + +Note: this is more subtle than it appears. The check for +repeat_nmi .. end_repeat_nmi jumps straight out of the NMI code +instead of adjusting the "iret" frame to force a repeat. This +is necessary, because the code between repeat_nmi and +end_repeat_nmi sets "NMI executing" and then writes to the +"iret" frame itself. If a nested NMI comes in and modifies the +"iret" frame while repeat_nmi is also modifying it, we'll end up +with garbage. The old code got this right, as does the new +code, but the new code is a bit more explicit. 
+ +If we were to move the check right after the "NMI executing" +check, then we'd get it wrong and have random crashes. + +( Because the "NMI executing" check would jump to the code that would + modify the "iret" frame without checking if the interrupted NMI was + currently modifying it. ) + +Signed-off-by: Andy Lutomirski +Reviewed-by: Steven Rostedt +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Ingo Molnar +[bwh: Backported to 4.0: adjust filename, spacing] +Signed-off-by: Ben Hutchings +--- + arch/x86/kernel/entry_64.S | 34 ++++++++++++++++++---------------- + 1 file changed, 18 insertions(+), 16 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1623,7 +1623,24 @@ ENTRY(nmi) + /* + * Determine whether we're a nested NMI. + * +- * First check "NMI executing". If it's set, then we're nested. ++ * If we interrupted kernel code between repeat_nmi and ++ * end_repeat_nmi, then we are a nested NMI. We must not ++ * modify the "iret" frame because it's being written by ++ * the outer NMI. That's okay; the outer NMI handler is ++ * about to call do_nmi anyway, so we can just ++ * resume the outer NMI. ++ */ ++ ++ movq $repeat_nmi, %rdx ++ cmpq 8(%rsp), %rdx ++ ja 1f ++ movq $end_repeat_nmi, %rdx ++ cmpq 8(%rsp), %rdx ++ ja nested_nmi_out ++1: ++ ++ /* ++ * Now check "NMI executing". If it's set, then we're nested. + * This will not detect if we interrupted an outer NMI just + * before IRET. + */ +@@ -1650,21 +1667,6 @@ ENTRY(nmi) + + nested_nmi: + /* +- * If we interrupted an NMI that is between repeat_nmi and +- * end_repeat_nmi, then we must not modify the "iret" frame +- * because it's being written by the outer NMI. That's okay; +- * the outer NMI handler is about to call do_nmi anyway, +- * so we can just resume the outer NMI.
+- */ +- movq $repeat_nmi, %rdx +- cmpq 8(%rsp), %rdx +- ja 1f +- movq $end_repeat_nmi, %rdx +- cmpq 8(%rsp), %rdx +- ja nested_nmi_out +- +-1: +- /* + * Modify the "iret" frame to point to repeat_nmi, forcing another + * iteration of NMI handling. + */ diff --git a/debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch b/debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch new file mode 100644 index 000000000..1d4bf0a75 --- /dev/null +++ b/debian/patches/bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch @@ -0,0 +1,90 @@ +From: Andy Lutomirski +Date: Wed, 15 Jul 2015 10:29:38 -0700 +Subject: x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI + detection +Origin: https://git.kernel.org/linus/810bc075f78ff2c221536eb3008eac6a492dba2d + +We have a tricky bug in the nested NMI code: if we see RSP +pointing to the NMI stack on NMI entry from kernel mode, we +assume that we are executing a nested NMI. + +This isn't quite true. A malicious userspace program can point +RSP at the NMI stack, issue SYSCALL, and arrange for an NMI to +happen while RSP is still pointing at the NMI stack. + +Fix it with a sneaky trick. Set DF in the region of code that +the RSP check is intended to detect. IRET will clear DF +atomically. + +( Note: other than paravirt, there's little need for all this + complexity. We could check RIP instead of RSP. 
) + +Signed-off-by: Andy Lutomirski +Reviewed-by: Steven Rostedt +Cc: Borislav Petkov +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: stable@vger.kernel.org +Signed-off-by: Ingo Molnar +[bwh: Backported to 4.0: adjust filename, context] +Signed-off-by: Ben Hutchings +--- + arch/x86/kernel/entry_64.S | 29 +++++++++++++++++++++++++---- + 1 file changed, 25 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -1650,7 +1650,14 @@ ENTRY(nmi) + /* + * Now test if the previous stack was an NMI stack. This covers + * the case where we interrupt an outer NMI after it clears +- * "NMI executing" but before IRET. ++ * "NMI executing" but before IRET. We need to be careful, though: ++ * there is one case in which RSP could point to the NMI stack ++ * despite there being no NMI active: naughty userspace controls ++ * RSP at the very beginning of the SYSCALL targets. We can ++ * pull a fast one on naughty userspace, though: we program ++ * SYSCALL to mask DF, so userspace cannot cause DF to be set ++ * if it controls the kernel's RSP. We set DF before we clear ++ * "NMI executing". + */ + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ +@@ -1661,10 +1668,16 @@ ENTRY(nmi) + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi +- /* Ah, it is within the NMI stack, treat it as nested */ ++ ++ /* Ah, it is within the NMI stack. */ ++ ++ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) ++ jz first_nmi /* RSP was user controlled. */ + + CFI_REMEMBER_STATE + ++ /* This is a nested NMI. */ ++ + nested_nmi: + /* + * Modify the "iret" frame to point to repeat_nmi, forcing another +@@ -1776,8 +1789,16 @@ nmi_restore: + + RESTORE_ALL 6*8 + +- /* Clear "NMI executing". */ +- movq $0, 5*8(%rsp) ++ /* ++ * Clear "NMI executing". 
Set DF first so that we can easily ++ * distinguish the remaining code between here and IRET from ++ * the SYSCALL entry and exit paths. On a native kernel, we ++ * could just inspect RIP, but, on paravirt kernels, ++ * INTERRUPT_RETURN can translate into a jump into a ++ * hypercall page. ++ */ ++ std ++ movq $0, 5*8(%rsp) /* clear "NMI executing" */ + + /* + * INTERRUPT_RETURN reads the "iret" frame and exits the NMI diff --git a/debian/patches/series b/debian/patches/series index ab67a7b25..662046724 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -82,3 +82,12 @@ debian/udp-fix-abi-change-in-4.0.6.patch bugfix/mips/mips-normalise-code-flow-in-the-cpu-exception-handle.patch bugfix/mips/mips-correct-fp-isa-requirements.patch bugfix/x86/kvm-x86-fix-kvm_apic_has_events-to-check-for-null-po.patch +bugfix/x86/0001-x86-asm-entry-64-Fold-the-test_in_nmi-macro-into-its.patch +bugfix/x86/0002-x86-asm-entry-64-Remove-a-redundant-jump.patch +bugfix/x86/0003-x86-asm-entry-64-Remove-pointless-jump-to-irq_return.patch +bugfix/x86/0004-x86-nmi-Enable-nested-do_nmi-handling-for-64-bit-ker.patch +bugfix/x86/0005-x86-nmi-64-Remove-asm-code-that-saves-cr2.patch +bugfix/x86/0006-x86-nmi-64-Switch-stacks-on-userspace-NMI-entry.patch +bugfix/x86/0007-x86-nmi-64-Improve-nested-NMI-comments.patch +bugfix/x86/0008-x86-nmi-64-Reorder-nested-NMI-checks.patch +bugfix/x86/0009-x86-nmi-64-Use-DF-to-avoid-userspace-RSP-confusing-n.patch