diff --git a/debian/changelog b/debian/changelog index ebb606343..8626fbb29 100644 --- a/debian/changelog +++ b/debian/changelog @@ -73,6 +73,11 @@ linux (4.4.5-1) UNRELEASED; urgency=medium * [x86] drm/i915: Fix oops caused by fbdev initialization failure * module: Fix ABI change in 4.4.5 * Revert "libata: Align ata_device's id on a cacheline" to avoid ABI change + * [amd64] Fix more regressions due to "efi: Build our own page table + structure": + - efi: Fix boot crash by always mapping boot service regions into new EFI + page tables (Closes: #815125) + - mm/pat: Fix boot crash when 1GB pages are not supported by cpu [ Ian Campbell ] * [arm64] Enable ARCH_HISI (Hisilicon) and the set of currently available diff --git a/debian/patches/bugfix/x86/x86-efi-fix-boot-crash-by-always-mapping-boot-servic.patch b/debian/patches/bugfix/x86/x86-efi-fix-boot-crash-by-always-mapping-boot-servic.patch new file mode 100644 index 000000000..56e8485aa --- /dev/null +++ b/debian/patches/bugfix/x86/x86-efi-fix-boot-crash-by-always-mapping-boot-servic.patch @@ -0,0 +1,197 @@ +From: Matt Fleming +Date: Fri, 11 Mar 2016 11:19:23 +0000 +Subject: x86/efi: Fix boot crash by always mapping boot service regions into + new EFI page tables +Origin: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/commit?id=452308de61056a539352a9306c46716d7af8a1f1 +Bug-Debian: https://bugs.debian.org/815125 + +Some machines have EFI regions in page zero (physical address +0x00000000) and historically that region has been added to the e820 +map via trim_bios_range(), and ultimately mapped into the kernel page +tables. It was not mapped via efi_map_regions() as one would expect. + +Alexis reports that with the new separate EFI page tables some boot +services regions, such as page zero, are not mapped. This triggers an +oops during the SetVirtualAddressMap() runtime call. + +For the EFI boot services quirk on x86 we need to memblock_reserve() +boot services regions until after SetVirtualAddressMap(). Doing that +while respecting the ownership of regions that may have already been +reserved by the kernel was the motivation behind this commit: + + 7d68dc3f1003 ("x86, efi: Do not reserve boot services regions within reserved areas") + +That patch was merged at a time when the EFI runtime virtual mappings +were inserted into the kernel page tables as described above, and the +trick of setting ->numpages (and hence the region size) to zero to +track regions that should not be freed in efi_free_boot_services() +meant that we never mapped those regions in efi_map_regions(). Instead +we were relying solely on the existing kernel mappings. + +Now that we have separate page tables we need to make sure the EFI +boot services regions are mapped correctly, even if someone else has +already called memblock_reserve(). Instead of stashing a tag in +->numpages, set the EFI_MEMORY_RUNTIME bit of ->attribute. Since it +generally makes no sense to mark a boot services region as required at +runtime, it's pretty much guaranteed the firmware will not have +already set this bit. + +For the record, the specific circumstances under which Alexis +triggered this bug was that an EFI runtime driver on his machine was +responding to the EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE event during +SetVirtualAddressMap(). + +The event handler for this driver looks like this, + + sub rsp,0x28 + lea rdx,[rip+0x2445] # 0xaa948720 + mov ecx,0x4 + call func_aa9447c0 ; call to ConvertPointer(4, & 0xaa948720) + mov r11,QWORD PTR [rip+0x2434] # 0xaa948720 + xor eax,eax + mov BYTE PTR [r11+0x1],0x1 + add rsp,0x28 + ret + +Which is pretty typical code for an EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE +handler. The "mov r11, QWORD PTR [rip+0x2424]" was the faulting +instruction because ConvertPointer() was being called to convert the +address 0x0000000000000000, which when converted is left unchanged and +remains 0x0000000000000000. + +The output of the oops trace gave the impression of a standard NULL +pointer dereference bug, but because we're accessing physical +addresses during ConvertPointer(), it wasn't. EFI boot services code +is stored at that address on Alexis' machine. + +Reported-by: Alexis Murzeau +Signed-off-by: Matt Fleming +Cc: Andy Lutomirski +Cc: Ard Biesheuvel +Cc: Ben Hutchings +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: H. Peter Anvin +Cc: Linus Torvalds +Cc: Maarten Lankhorst +Cc: Matthew Garrett +Cc: Peter Zijlstra +Cc: Raphael Hertzog +Cc: Roger Shimizu +Cc: Thomas Gleixner +Cc: linux-efi@vger.kernel.org +Link: http://lkml.kernel.org/r/1457695163-29632-2-git-send-email-matt@codeblueprint.co.uk +Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=815125 +Signed-off-by: Ingo Molnar +--- + arch/x86/platform/efi/quirks.c | 79 +++++++++++++++++++++++++++++++++--------- + 1 file changed, 62 insertions(+), 17 deletions(-) + +--- a/arch/x86/platform/efi/quirks.c ++++ b/arch/x86/platform/efi/quirks.c +@@ -130,6 +130,27 @@ efi_status_t efi_query_variable_store(u3 + EXPORT_SYMBOL_GPL(efi_query_variable_store); + + /* ++ * Helper function for efi_reserve_boot_services() to figure out if we ++ * can free regions in efi_free_boot_services(). ++ * ++ * Use this function to ensure we do not free regions owned by somebody ++ * else. We must only reserve (and then free) regions: ++ * ++ * - Not within any part of the kernel ++ * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc) ++ */ ++static bool can_free_region(u64 start, u64 size) ++{ ++ if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) ++ return false; ++ ++ if (!e820_all_mapped(start, start+size, E820_RAM)) ++ return false; ++ ++ return true; ++} ++ ++/* + * The UEFI specification makes it clear that the operating system is free to do + * whatever it wants with boot services code after ExitBootServices() has been + * called. Ignoring this recommendation a significant bunch of EFI implementations +@@ -146,26 +167,50 @@ void __init efi_reserve_boot_services(vo + efi_memory_desc_t *md = p; + u64 start = md->phys_addr; + u64 size = md->num_pages << EFI_PAGE_SHIFT; ++ bool already_reserved; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; +- /* Only reserve where possible: +- * - Not within any already allocated areas +- * - Not over any memory area (really needed, if above?) +- * - Not within any part of the kernel +- * - Not the bios reserved area +- */ +- if ((start + size > __pa_symbol(_text) +- && start <= __pa_symbol(_end)) || +- !e820_all_mapped(start, start+size, E820_RAM) || +- memblock_is_region_reserved(start, size)) { +- /* Could not reserve, skip it */ +- md->num_pages = 0; +- memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", +- start, start+size-1); +- } else ++ ++ already_reserved = memblock_is_region_reserved(start, size); ++ ++ /* ++ * Because the following memblock_reserve() is paired ++ * with free_bootmem_late() for this region in ++ * efi_free_boot_services(), we must be extremely ++ * careful not to reserve, and subsequently free, ++ * critical regions of memory (like the kernel image) or ++ * those regions that somebody else has already ++ * reserved. ++ * ++ * A good example of a critical region that must not be ++ * freed is page zero (first 4Kb of memory), which may ++ * contain boot services code/data but is marked ++ * E820_RESERVED by trim_bios_range(). ++ */ ++ if (!already_reserved) { + memblock_reserve(start, size); ++ ++ /* ++ * If we are the first to reserve the region, no ++ * one else cares about it. We own it and can ++ * free it later. ++ */ ++ if (can_free_region(start, size)) ++ continue; ++ } ++ ++ /* ++ * We don't own the region. We must not free it. ++ * ++ * Setting this bit for a boot services region really ++ * doesn't make sense as far as the firmware is ++ * concerned, but it does provide us with a way to tag ++ * those regions that must not be paired with ++ * free_bootmem_late(). ++ */ ++ md->attribute |= EFI_MEMORY_RUNTIME; + } + } + +@@ -182,8 +227,8 @@ void __init efi_free_boot_services(void) + md->type != EFI_BOOT_SERVICES_DATA) + continue; + +- /* Could not reserve boot area */ +- if (!size) ++ /* Do not free, someone else owns it: */ ++ if (md->attribute & EFI_MEMORY_RUNTIME) + continue; + + free_bootmem_late(start, size); diff --git a/debian/patches/bugfix/x86/x86-mm-pat-fix-boot-crash-when-1gb-pages-are-not-supported.patch b/debian/patches/bugfix/x86/x86-mm-pat-fix-boot-crash-when-1gb-pages-are-not-supported.patch new file mode 100644 index 000000000..bae631eb3 --- /dev/null +++ b/debian/patches/bugfix/x86/x86-mm-pat-fix-boot-crash-when-1gb-pages-are-not-supported.patch @@ -0,0 +1,58 @@ +From: Matt Fleming +Date: Mon, 14 Mar 2016 10:33:01 +0000 +Subject: x86/mm/pat: Fix boot crash when 1GB pages are not supported by cpu +Origin: http://mid.gmane.org/1457951581-27353-2-git-send-email-matt@codeblueprint.co.uk + +Scott reports that with the new separate EFI page tables he's seeing +the following error on boot, caused by setting reserved bits in the +page table structures (fault code is PF_RSVD | PF_PROT), + + swapper/0: Corrupted page table at address 17b102020 + PGD 17b0e5063 PUD 1400000e3 + Bad pagetable: 0009 [#1] SMP + +On first inspection the PUD is using a 1GB page size (_PAGE_PSE) and +looks fine but that's only true if support for 1GB PUD pages +("pdpe1gb") is present in the cpu. + +Scott's Intel Celeron N2820 does not have that feature and so the +_PAGE_PSE bit is reserved. Fix this issue by making the 1GB mapping +code in conditional on "cpu_has_gbpages". + +This issue didn't come up in the past because the required mapping for +the faulting address (0x17b102020) will already have been setup by the +kernel in early boot before we got to efi_map_regions(), but we no +longer use the standard kernel page tables during EFI calls. + +Reported-by: Scott Ashcroft +Tested-by: Scott Ashcroft +Cc: Ard Biesheuvel +Cc: Ben Hutchings +Cc: Borislav Petkov +Cc: Brian Gerst +Cc: Denys Vlasenko +Cc: "H. Peter Anvin" +Cc: Linus Torvalds +Cc: Maarten Lankhorst +Cc: Matthew Garrett +Cc: Peter Zijlstra +Cc: Raphael Hertzog +Cc: Roger Shimizu +Cc: Thomas Gleixner +Cc: linux-efi@vger.kernel.org +Signed-off-by: Matt Fleming +--- + arch/x86/mm/pageattr.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -1036,7 +1036,7 @@ static int populate_pud(struct cpa_data + /* + * Map everything starting from the Gb boundary, possibly with 1G pages + */ +- while (end - start >= PUD_SIZE) { ++ while (cpu_has_gbpages && end - start >= PUD_SIZE) { + set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | + massage_pgprot(pud_pgprot))); + diff --git a/debian/patches/series b/debian/patches/series index 51d58160a..e4b136c10 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -137,3 +137,5 @@ bugfix/powerpc/powerpc-fix-dedotify-for-binutils-2.26.patch bugfix/x86/drm-i915-Fix-oops-caused-by-fbdev-initialization-fai.patch debian/revert-libata-align-ata_device-s-id-on-a-cacheline.patch debian/module-fix-abi-change-in-4.4.5.patch +bugfix/x86/x86-efi-fix-boot-crash-by-always-mapping-boot-servic.patch +bugfix/x86/x86-mm-pat-fix-boot-crash-when-1gb-pages-are-not-supported.patch