From 303c65230ff59932cf8e480ee89e8abd56c086f5 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 1 Jun 2012 03:47:24 +0000 Subject: [PATCH] Add a bunch of security patches svn path=/dists/sid/linux-2.6/; revision=19050 --- debian/changelog | 4 + ...use-after-free-bug-in-quota-handling.patch | 451 ++++++++++++++++++ ...etlb-fix-resv_map-leak-in-error-path.patch | 95 ++++ .../mm-fix-vma_resv_map-null-pointer.patch | 66 +++ ...ae-pmd-walk-vs-pmd_populate-smp-race.patch | 214 +++++++++ debian/patches/series/base | 5 + 6 files changed, 835 insertions(+) create mode 100644 debian/patches/bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch create mode 100644 debian/patches/bugfix/all/hugetlb-fix-resv_map-leak-in-error-path.patch create mode 100644 debian/patches/bugfix/all/mm-fix-vma_resv_map-null-pointer.patch create mode 100644 debian/patches/bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch diff --git a/debian/changelog b/debian/changelog index 95fcff5af..dac770940 100644 --- a/debian/changelog +++ b/debian/changelog @@ -27,6 +27,10 @@ linux-2.6 (3.2.19-1) UNRELEASED; urgency=low * [x86] ata_piix: defer disks to the Hyper-V drivers by default * [x86] drm/i915:: Disable FBC on SandyBridge (Closes: #675022) * AppArmor: compatibility patch for v5 interface (Closes: #661151) + * hugepages: fix use after free bug in "quota" handling (CVE-2012-2133) + * [x86] mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race + condition (CVE-2012-2373) + * hugetlb: fix resv_map leak in error path (CVE-2012-2390) -- Ben Hutchings Sun, 27 May 2012 01:12:44 +0100 diff --git a/debian/patches/bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch b/debian/patches/bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch new file mode 100644 index 000000000..2105d09cd --- /dev/null +++ b/debian/patches/bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch @@ -0,0 +1,451 @@ +From: David Gibson +Date: Wed, 21 Mar 2012 16:34:12 -0700 +Subject: [PATCH] hugepages: fix use after free bug in "quota" handling + +commit 90481622d75715bfcb68501280a917dbfe516029 upstream. + +hugetlbfs_{get,put}_quota() are badly named. They don't interact with the +general quota handling code, and they don't much resemble its behaviour. +Rather than being about maintaining limits on on-disk block usage by +particular users, they are instead about maintaining limits on in-memory +page usage (including anonymous MAP_PRIVATE copied-on-write pages) +associated with a particular hugetlbfs filesystem instance. + +Worse, they work by having callbacks to the hugetlbfs filesystem code from +the low-level page handling code, in particular from free_huge_page(). +This is a layering violation of itself, but more importantly, if the +kernel does a get_user_pages() on hugepages (which can happen from KVM +amongst others), then the free_huge_page() can be delayed until after the +associated inode has already been freed. If an unmount occurs at the +wrong time, even the hugetlbfs superblock where the "quota" limits are +stored may have been freed. + +Andrew Barry proposed a patch to fix this by having hugepages, instead of +storing a pointer to their address_space and reaching the superblock from +there, had the hugepages store pointers directly to the superblock, +bumping the reference count as appropriate to avoid it being freed. +Andrew Morton rejected that version, however, on the grounds that it made +the existing layering violation worse. 
+ +This is a reworked version of Andrew's patch, which removes the extra, and +some of the existing, layering violation. It works by introducing the +concept of a hugepage "subpool" at the lower hugepage mm layer - that is a +finite logical pool of hugepages to allocate from. hugetlbfs now creates +a subpool for each filesystem instance with a page limit set, and a +pointer to the subpool gets added to each allocated hugepage, instead of +the address_space pointer used now. The subpool has its own lifetime and +is only freed once all pages in it _and_ all other references to it (i.e. +superblocks) are gone. + +subpools are optional - a NULL subpool pointer is taken by the code to +mean that no subpool limits are in effect. + +Previous discussion of this bug found in: "Fix refcounting in hugetlbfs +quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or +http://marc.info/?l=linux-mm&m=126928970510627&w=1 + +v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to +alloc_huge_page() - since it already takes the vma, it is not necessary. + +Signed-off-by: Andrew Barry +Signed-off-by: David Gibson +Cc: Hugh Dickins +Cc: Mel Gorman +Cc: Minchan Kim +Cc: Hillf Danton +Cc: Paul Mackerras +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +[bwh: Backported to 3.2: adjust context to apply after commit + c50ac050811d6485616a193eb0f37bfbd191cc89 'hugetlb: fix resv_map leak in + error path' which should be in 3.2.20] +--- +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) + spin_lock(&sbinfo->stat_lock); + /* If no limits set, just report 0 for max/free/used + * blocks, like simple_statfs() */ +- if (sbinfo->max_blocks >= 0) { +- buf->f_blocks = sbinfo->max_blocks; +- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; ++ if (sbinfo->spool) { ++ long free_pages; ++ ++ spin_lock(&sbinfo->spool->lock); ++ buf->f_blocks = sbinfo->spool->max_hpages; ++ free_pages = sbinfo->spool->max_hpages ++ - sbinfo->spool->used_hpages; ++ buf->f_bavail = buf->f_bfree = free_pages; ++ spin_unlock(&sbinfo->spool->lock); + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } +@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb) + + if (sbi) { + sb->s_fs_info = NULL; ++ ++ if (sbi->spool) ++ hugepage_put_subpool(sbi->spool); ++ + kfree(sbi); + } + } +@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) + sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; + spin_lock_init(&sbinfo->stat_lock); +- sbinfo->max_blocks = config.nr_blocks; +- sbinfo->free_blocks = config.nr_blocks; + sbinfo->max_inodes = config.nr_inodes; + sbinfo->free_inodes = config.nr_inodes; ++ sbinfo->spool = NULL; ++ if (config.nr_blocks != -1) { ++ sbinfo->spool = hugepage_new_subpool(config.nr_blocks); ++ if (!sbinfo->spool) ++ goto out_free; ++ } + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); +@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) + sb->s_root = root; + return 0; + out_free: ++ if (sbinfo->spool) ++ kfree(sbinfo->spool); + kfree(sbinfo); + return -ENOMEM; + } + +-int hugetlb_get_quota(struct address_space *mapping, long delta) +-{ +- int ret = 0; +- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); +- +- if (sbinfo->free_blocks > -1) { +- 
spin_lock(&sbinfo->stat_lock); +- if (sbinfo->free_blocks - delta >= 0) +- sbinfo->free_blocks -= delta; +- else +- ret = -ENOMEM; +- spin_unlock(&sbinfo->stat_lock); +- } +- +- return ret; +-} +- +-void hugetlb_put_quota(struct address_space *mapping, long delta) +-{ +- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); +- +- if (sbinfo->free_blocks > -1) { +- spin_lock(&sbinfo->stat_lock); +- sbinfo->free_blocks += delta; +- spin_unlock(&sbinfo->stat_lock); +- } +-} +- + static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) + { +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index 7adc492..cf01817 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -14,6 +14,15 @@ struct user_struct; + #include + #include + ++struct hugepage_subpool { ++ spinlock_t lock; ++ long count; ++ long max_hpages, used_hpages; ++}; ++ ++struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); ++void hugepage_put_subpool(struct hugepage_subpool *spool); ++ + int PageHuge(struct page *page); + + void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +@@ -129,12 +138,11 @@ enum { + }; + + struct hugetlbfs_sb_info { +- long max_blocks; /* blocks allowed */ +- long free_blocks; /* blocks free */ + long max_inodes; /* inodes allowed */ + long free_inodes; /* inodes free */ + spinlock_t stat_lock; + struct hstate *hstate; ++ struct hugepage_subpool *spool; + }; + + +@@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations; + extern const struct vm_operations_struct hugetlb_vm_ops; + struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, + struct user_struct **user, int creat_flags); +-int hugetlb_get_quota(struct address_space *mapping, long delta); +-void hugetlb_put_quota(struct address_space *mapping, long delta); + + static inline int is_file_hugepages(struct file *file) + { +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index b1c3148..afa057a 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; + */ + static DEFINE_SPINLOCK(hugetlb_lock); + ++static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) ++{ ++ bool free = (spool->count == 0) && (spool->used_hpages == 0); ++ ++ spin_unlock(&spool->lock); ++ ++ /* If no pages are used, and no other handles to the subpool ++ * remain, free the subpool the subpool remain */ ++ if (free) ++ kfree(spool); ++} ++ ++struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) ++{ ++ struct hugepage_subpool *spool; ++ ++ spool = kmalloc(sizeof(*spool), GFP_KERNEL); ++ if (!spool) ++ return NULL; ++ ++ spin_lock_init(&spool->lock); ++ spool->count = 1; ++ spool->max_hpages = nr_blocks; ++ spool->used_hpages = 0; ++ ++ return spool; ++} ++ ++void hugepage_put_subpool(struct hugepage_subpool *spool) ++{ ++ spin_lock(&spool->lock); ++ BUG_ON(!spool->count); ++ spool->count--; ++ unlock_or_release_subpool(spool); ++} ++ ++static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, ++ long delta) ++{ ++ int ret = 0; ++ ++ if (!spool) ++ return 0; ++ ++ spin_lock(&spool->lock); ++ if ((spool->used_hpages + delta) <= spool->max_hpages) { ++ spool->used_hpages += delta; ++ } else { ++ ret = -ENOMEM; ++ } ++ spin_unlock(&spool->lock); ++ ++ return ret; ++} ++ ++static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, ++ long delta) ++{ ++ if (!spool) ++ return; ++ ++ spin_lock(&spool->lock); ++ 
spool->used_hpages -= delta; ++ /* If hugetlbfs_put_super couldn't free spool due to ++ * an outstanding quota reference, free it now. */ ++ unlock_or_release_subpool(spool); ++} ++ ++static inline struct hugepage_subpool *subpool_inode(struct inode *inode) ++{ ++ return HUGETLBFS_SB(inode->i_sb)->spool; ++} ++ ++static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) ++{ ++ return subpool_inode(vma->vm_file->f_dentry->d_inode); ++} ++ + /* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. +@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page) + */ + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); +- struct address_space *mapping; ++ struct hugepage_subpool *spool = ++ (struct hugepage_subpool *)page_private(page); + +- mapping = (struct address_space *) page_private(page); + set_page_private(page, 0); + page->mapping = NULL; + BUG_ON(page_count(page)); +@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page) + enqueue_huge_page(h, page); + } + spin_unlock(&hugetlb_lock); +- if (mapping) +- hugetlb_put_quota(mapping, 1); ++ hugepage_subpool_put_pages(spool, 1); + } + + static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, + /* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase +- * reservation and actually increase quota before an allocation can occur. +- * Where any new reservation would be required the reservation change is +- * prepared, but not committed. Once the page has been quota'd allocated +- * an instantiated the change should be committed via vma_commit_reservation. +- * No action is required on failure. ++ * reservation and actually increase subpool usage before an allocation ++ * can occur. Where any new reservation would be required the ++ * reservation change is prepared, but not committed. Once the page ++ * has been allocated from the subpool and instantiated the change should ++ * be committed via vma_commit_reservation. No action is required on ++ * failure. + */ + static long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, + static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) + { ++ struct hugepage_subpool *spool = subpool_vma(vma); + struct hstate *h = hstate_vma(vma); + struct page *page; +- struct address_space *mapping = vma->vm_file->f_mapping; +- struct inode *inode = mapping->host; + long chg; + + /* +- * Processes that did not create the mapping will have no reserves and +- * will not have accounted against quota. Check that the quota can be +- * made before satisfying the allocation +- * MAP_NORESERVE mappings may also need pages and quota allocated +- * if no reserve mapping overlaps. ++ * Processes that did not create the mapping will have no ++ * reserves and will not have accounted against subpool ++ * limit. Check that the subpool limit can be made before ++ * satisfying the allocation MAP_NORESERVE mappings may also ++ * need pages and subpool limit allocated allocated if no reserve ++ * mapping overlaps. 
+ */ + chg = vma_needs_reservation(h, vma, addr); + if (chg < 0) + return ERR_PTR(-VM_FAULT_OOM); + if (chg) +- if (hugetlb_get_quota(inode->i_mapping, chg)) ++ if (hugepage_subpool_get_pages(spool, chg)) + return ERR_PTR(-VM_FAULT_SIGBUS); + + spin_lock(&hugetlb_lock); +@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, + if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) { +- hugetlb_put_quota(inode->i_mapping, chg); ++ hugepage_subpool_put_pages(spool, chg); + return ERR_PTR(-VM_FAULT_SIGBUS); + } + } + +- set_page_private(page, (unsigned long) mapping); ++ set_page_private(page, (unsigned long)spool); + + vma_commit_reservation(h, vma, addr); + +@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) + { + struct hstate *h = hstate_vma(vma); + struct resv_map *reservations = vma_resv_map(vma); ++ struct hugepage_subpool *spool = subpool_vma(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; +@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) + + if (reserve) { + hugetlb_acct_memory(h, -reserve); +- hugetlb_put_quota(vma->vm_file->f_mapping, reserve); ++ hugepage_subpool_put_pages(spool, reserve); + } + } + } +@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + address = address & huge_page_mask(h); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); +- mapping = (struct address_space *)page_private(page); ++ mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + + /* + * Take the mapping lock for the duration of the table walk. As +@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode, + { + long ret, chg; + struct hstate *h = hstate_inode(inode); ++ struct hugepage_subpool *spool = subpool_inode(inode); + + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page +- * and filesystem quota without using reserves ++ * without using reserves + */ + if (vm_flags & VM_NORESERVE) + return 0; +@@ -2915,19 +2995,19 @@ int hugetlb_reserve_pages(struct inode *inode, + goto out_err; + } + +- /* There must be enough filesystem quota for the mapping */ +- if (hugetlb_get_quota(inode->i_mapping, chg)) { ++ /* There must be enough pages in the subpool for the mapping */ ++ if (hugepage_subpool_get_pages(spool, chg)) { + ret = -ENOSPC; + goto out_err; + } + + /* + * Check enough hugepages are available for the reservation. 
+- * Hand back the quota if there are not ++ * Hand the pages back to the subpool if there are not + */ + ret = hugetlb_acct_memory(h, chg); + if (ret < 0) { +- hugetlb_put_quota(inode->i_mapping, chg); ++ hugepage_subpool_put_pages(spool, chg); + goto out_err; + } + +@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) + { + struct hstate *h = hstate_inode(inode); + long chg = region_truncate(&inode->i_mapping->private_list, offset); ++ struct hugepage_subpool *spool = subpool_inode(inode); + + spin_lock(&inode->i_lock); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); + spin_unlock(&inode->i_lock); + +- hugetlb_put_quota(inode->i_mapping, (chg - freed)); ++ hugepage_subpool_put_pages(spool, (chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); + } + diff --git a/debian/patches/bugfix/all/hugetlb-fix-resv_map-leak-in-error-path.patch b/debian/patches/bugfix/all/hugetlb-fix-resv_map-leak-in-error-path.patch new file mode 100644 index 000000000..2ea5e1c81 --- /dev/null +++ b/debian/patches/bugfix/all/hugetlb-fix-resv_map-leak-in-error-path.patch @@ -0,0 +1,95 @@ +From: Dave Hansen +Date: Fri, 18 May 2012 11:46:30 -0700 +Subject: hugetlb: fix resv_map leak in error path + +commit c50ac050811d6485616a193eb0f37bfbd191cc89 upstream. + +When called for anonymous (non-shared) mappings, hugetlb_reserve_pages() +does a resv_map_alloc(). It depends on code in hugetlbfs's +vm_ops->close() to release that allocation. + +However, in the mmap() failure path, we do a plain unmap_region() without +the remove_vma() which actually calls vm_ops->close(). + +This is a decent fix. This leak could get reintroduced if new code (say, +after hugetlb_reserve_pages() in hugetlbfs_file_mmap()) decides to return +an error. But, I think it would have to unroll the reservation anyway. + +Christoph's test case: + + http://marc.info/?l=linux-mm&m=133728900729735 + +Signed-off-by: Dave Hansen +[Christoph Lameter: I have rediffed the patch against 2.6.32 and 3.2.0.] +Signed-off-by: Ben Hutchings +--- + mm/hugetlb.c | 28 ++++++++++++++++++++++------ + 1 file changed, 22 insertions(+), 6 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -2068,6 +2068,15 @@ + kref_get(&reservations->refs); + } + ++static void resv_map_put(struct vm_area_struct *vma) ++{ ++ struct resv_map *reservations = vma_resv_map(vma); ++ ++ if (!reservations) ++ return; ++ kref_put(&reservations->refs, resv_map_release); ++} ++ + static void hugetlb_vm_op_close(struct vm_area_struct *vma) + { + struct hstate *h = hstate_vma(vma); +@@ -2083,7 +2092,7 @@ + reserve = (end - start) - + region_count(&reservations->regions, start, end); + +- kref_put(&reservations->refs, resv_map_release); ++ resv_map_put(vma); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); +@@ -2884,12 +2893,16 @@ + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + +- if (chg < 0) +- return chg; ++ if (chg < 0) { ++ ret = chg; ++ goto out_err; ++ } + + /* There must be enough filesystem quota for the mapping */ +- if (hugetlb_get_quota(inode->i_mapping, chg)) +- return -ENOSPC; ++ if (hugetlb_get_quota(inode->i_mapping, chg)) { ++ ret = -ENOSPC; ++ goto out_err; ++ } + + /* + * Check enough hugepages are available for the reservation. 
+@@ -2898,7 +2911,7 @@ + ret = hugetlb_acct_memory(h, chg); + if (ret < 0) { + hugetlb_put_quota(inode->i_mapping, chg); +- return ret; ++ goto out_err; + } + + /* +@@ -2915,6 +2928,9 @@ + if (!vma || vma->vm_flags & VM_MAYSHARE) + region_add(&inode->i_mapping->private_list, from, to); + return 0; ++out_err: ++ resv_map_put(vma); ++ return ret; + } + + void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) diff --git a/debian/patches/bugfix/all/mm-fix-vma_resv_map-null-pointer.patch b/debian/patches/bugfix/all/mm-fix-vma_resv_map-null-pointer.patch new file mode 100644 index 000000000..d6c0c3974 --- /dev/null +++ b/debian/patches/bugfix/all/mm-fix-vma_resv_map-null-pointer.patch @@ -0,0 +1,66 @@ +From: Dave Hansen +Date: Wed, 30 May 2012 07:51:07 -0700 +Subject: mm: fix vma_resv_map() NULL pointer + +commit 4523e1458566a0e8ecfaff90f380dd23acc44d27 upstream. + +hugetlb_reserve_pages() can be used for either normal file-backed +hugetlbfs mappings, or MAP_HUGETLB. In the MAP_HUGETLB, semi-anonymous +mode, there is not a VMA around. The new call to resv_map_put() assumed +that there was, and resulted in a NULL pointer dereference: + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 + IP: vma_resv_map+0x9/0x30 + PGD 141453067 PUD 1421e1067 PMD 0 + Oops: 0000 [#1] PREEMPT SMP + ... + Pid: 14006, comm: trinity-child6 Not tainted 3.4.0+ #36 + RIP: vma_resv_map+0x9/0x30 + ... + Process trinity-child6 (pid: 14006, threadinfo ffff8801414e0000, task ffff8801414f26b0) + Call Trace: + resv_map_put+0xe/0x40 + hugetlb_reserve_pages+0xa6/0x1d0 + hugetlb_file_setup+0x102/0x2c0 + newseg+0x115/0x360 + ipcget+0x1ce/0x310 + sys_shmget+0x5a/0x60 + system_call_fastpath+0x16/0x1b + +This was reported by Dave Jones, but was reproducible with the +libhugetlbfs test cases, so shame on me for not running them in the +first place. + +With this, the oops is gone, and the output of libhugetlbfs's +run_tests.py is identical to plain 3.4 again. + +[ Marked for stable, since this was introduced by commit c50ac050811d + ("hugetlb: fix resv_map leak in error path") which was also marked for + stable ] + +Reported-by: Dave Jones +Cc: Mel Gorman +Cc: KOSAKI Motohiro +Cc: Christoph Lameter +Cc: Andrea Arcangeli +Cc: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +--- + mm/hugetlb.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 285a81e..e198831 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3036,7 +3036,8 @@ int hugetlb_reserve_pages(struct inode *inode, + region_add(&inode->i_mapping->private_list, from, to); + return 0; + out_err: +- resv_map_put(vma); ++ if (vma) ++ resv_map_put(vma); + return ret; + } + diff --git a/debian/patches/bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch b/debian/patches/bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch new file mode 100644 index 000000000..1b2f44156 --- /dev/null +++ b/debian/patches/bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch @@ -0,0 +1,214 @@ +From: Andrea Arcangeli +Date: Tue, 29 May 2012 15:06:49 -0700 +Subject: mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race + condition + +commit 26c191788f18129af0eb32a358cdaea0c7479626 upstream. 
+ +When holding the mmap_sem for reading, pmd_offset_map_lock should only +run on a pmd_t that has been read atomically from the pmdp pointer, +otherwise we may read only half of it leading to this crash. + +PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic" + #0 [f06a9dd8] crash_kexec at c049b5ec + #1 [f06a9e2c] oops_end at c083d1c2 + #2 [f06a9e40] no_context at c0433ded + #3 [f06a9e64] bad_area_nosemaphore at c043401a + #4 [f06a9e6c] __do_page_fault at c0434493 + #5 [f06a9eec] do_page_fault at c083eb45 + #6 [f06a9f04] error_code (via page_fault) at c083c5d5 + EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP: + 00000000 + DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0 + CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246 + #7 [f06a9f38] _spin_lock at c083bc14 + #8 [f06a9f44] sys_mincore at c0507b7d + #9 [f06a9fb0] system_call at c083becd + start len + EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f + DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00 + SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033 + CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286 + +This should be a longstanding bug affecting x86 32bit PAE without THP. +Only archs with 64bit large pmd_t and 32bit unsigned long should be +affected. + +With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad() +would partly hide the bug when the pmd transition from none to stable, +by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is +enabled a new set of problem arises by the fact could then transition +freely in any of the none, pmd_trans_huge or pmd_trans_stable states. +So making the barrier in pmd_none_or_trans_huge_or_clear_bad() +unconditional isn't good idea and it would be a flakey solution. + +This should be fully fixed by introducing a pmd_read_atomic that reads +the pmd in order with THP disabled, or by reading the pmd atomically +with cmpxchg8b with THP enabled. + +Luckily this new race condition only triggers in the places that must +already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix +is localized there but this bug is not related to THP. + +NOTE: this can trigger on x86 32bit systems with PAE enabled with more +than 4G of ram, otherwise the high part of the pmd will never risk to be +truncated because it would be zero at all times, in turn so hiding the +SMP race. + +This bug was discovered and fully debugged by Ulrich, quote: + +---- +[..] +pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and +eax. + + 496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t + *pmd) + 497 { + 498 /* depend on compiler for an atomic pmd read */ + 499 pmd_t pmdval = *pmd; + + // edi = pmd pointer +0xc0507a74 : mov 0x8(%esp),%edi +... + // edx = PTE page table high address +0xc0507a84 : mov 0x4(%edi),%edx +... + // eax = PTE page table low address +0xc0507a8e : mov (%edi),%eax + +[..] + +Please note that the PMD is not read atomically. These are two "mov" +instructions where the high order bits of the PMD entry are fetched +first. Hence, the above machine code is prone to the following race. + +- The PMD entry {high|low} is 0x0000000000000000. + The "mov" at 0xc0507a84 loads 0x00000000 into edx. + +- A page fault (on another CPU) sneaks in between the two "mov" + instructions and instantiates the PMD. + +- The PMD entry {high|low} is now 0x00000003fda38067. + The "mov" at 0xc0507a8e loads 0xfda38067 into eax. 
+---- + +Reported-by: Ulrich Obergfell +Signed-off-by: Andrea Arcangeli +Cc: Mel Gorman +Cc: Hugh Dickins +Cc: Larry Woodman +Cc: Petr Matousek +Cc: Rik van Riel +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Ben Hutchings +--- + arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++ + include/asm-generic/pgtable.h | 22 +++++++++++++-- + 2 files changed, 70 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h +index effff47..43876f1 100644 +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) + ptep->pte_low = pte.pte_low; + } + ++#define pmd_read_atomic pmd_read_atomic ++/* ++ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with ++ * a "*pmdp" dereference done by gcc. Problem is, in certain places ++ * where pte_offset_map_lock is called, concurrent page faults are ++ * allowed, if the mmap_sem is hold for reading. An example is mincore ++ * vs page faults vs MADV_DONTNEED. On the page fault side ++ * pmd_populate rightfully does a set_64bit, but if we're reading the ++ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen ++ * because gcc will not read the 64bit of the pmd atomically. To fix ++ * this all places running pmd_offset_map_lock() while holding the ++ * mmap_sem in read mode, shall read the pmdp pointer using this ++ * function to know if the pmd is null nor not, and in turn to know if ++ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd ++ * operations. ++ * ++ * Without THP if the mmap_sem is hold for reading, the ++ * pmd can only transition from null to not null while pmd_read_atomic runs. ++ * So there's no need of literally reading it atomically. ++ * ++ * With THP if the mmap_sem is hold for reading, the pmd can become ++ * THP or null or point to a pte (and in turn become "stable") at any ++ * time under pmd_read_atomic, so it's mandatory to read it atomically ++ * with cmpxchg8b. ++ */ ++#ifndef CONFIG_TRANSPARENT_HUGEPAGE ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ pmdval_t ret; ++ u32 *tmp = (u32 *)pmdp; ++ ++ ret = (pmdval_t) (*tmp); ++ if (ret) { ++ /* ++ * If the low part is null, we must not read the high part ++ * or we can end up with a partial pmd. ++ */ ++ smp_rmb(); ++ ret |= ((pmdval_t)*(tmp + 1)) << 32; ++ } ++ ++ return (pmd_t) { ret }; ++} ++#else /* CONFIG_TRANSPARENT_HUGEPAGE */ ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; ++} ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++ + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) + { + set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); +diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +index e2768f1..6f2b45a 100644 +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd) + #endif /* __HAVE_ARCH_PMD_WRITE */ + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + ++#ifndef pmd_read_atomic ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ /* ++ * Depend on compiler for an atomic pmd read. NOTE: this is ++ * only going to work, if the pmdval_t isn't larger than ++ * an unsigned long. 
++ */ ++ return *pmdp; ++} ++#endif ++ + /* + * This function is meant to be used by sites walking pagetables with + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and +@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd) + * undefined so behaving like if the pmd was none is safe (because it + * can return none anyway). The compiler level barrier() is critically + * important to compute the two checks atomically on the same pmdval. ++ * ++ * For 32bit kernels with a 64bit large pmd_t this automatically takes ++ * care of reading the pmd atomically to avoid SMP race conditions ++ * against pmd_populate() when the mmap_sem is hold for reading by the ++ * caller (a special atomic read not done by "gcc" as in the generic ++ * version above, is also needed when THP is disabled because the page ++ * fault can populate the pmd from under us). + */ + static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) + { +- /* depend on compiler for an atomic pmd read */ +- pmd_t pmdval = *pmd; ++ pmd_t pmdval = pmd_read_atomic(pmd); + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. diff --git a/debian/patches/series/base b/debian/patches/series/base index ff71984dd..a2c340fc9 100644 --- a/debian/patches/series/base +++ b/debian/patches/series/base @@ -302,3 +302,8 @@ # AppArmor userland compatibility. This had better be gone in wheezy+1! + features/all/AppArmor-compatibility-patch-for-v5-interface.patch + ++ bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch ++ bugfix/all/hugetlb-fix-resv_map-leak-in-error-path.patch ++ bugfix/all/mm-fix-vma_resv_map-null-pointer.patch ++ bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch
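
For readers reviewing the CVE-2012-2133 fix above: the core of that change is a reference-counted "subpool" whose storage is released only once both its handle count and its page usage reach zero, so superblock teardown and a late free_huge_page() can no longer race on freed memory. Below is a minimal user-space C sketch of that lifetime rule only; the names (page_pool, pool_get_pages, ...) are invented for illustration, a pthread mutex stands in for the kernel spinlock, and the error handling is simplified -- this is not the kernel API, just the accounting idea.

/*
 * User-space analogue of the hugepage subpool lifetime rule: the pool
 * is freed only when no handles (count) and no pages (used_pages)
 * remain, by whichever of the two release paths gets there last.
 * Illustrative sketch only.
 */
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct page_pool {
	pthread_mutex_t lock;
	long count;		/* handles held, e.g. by a "superblock" */
	long max_pages, used_pages;
};

static void unlock_or_release_pool(struct page_pool *pool)
{
	int free_it = (pool->count == 0) && (pool->used_pages == 0);

	pthread_mutex_unlock(&pool->lock);
	if (free_it) {
		pthread_mutex_destroy(&pool->lock);
		free(pool);
	}
}

struct page_pool *pool_new(long max_pages)
{
	struct page_pool *pool = malloc(sizeof(*pool));

	if (!pool)
		return NULL;
	pthread_mutex_init(&pool->lock, NULL);
	pool->count = 1;
	pool->max_pages = max_pages;
	pool->used_pages = 0;
	return pool;
}

void pool_put(struct page_pool *pool)		/* drop a handle */
{
	pthread_mutex_lock(&pool->lock);
	assert(pool->count > 0);
	pool->count--;
	unlock_or_release_pool(pool);
}

int pool_get_pages(struct page_pool *pool, long delta)
{
	int ret = 0;

	pthread_mutex_lock(&pool->lock);
	if (pool->used_pages + delta <= pool->max_pages)
		pool->used_pages += delta;
	else
		ret = -1;	/* over the pool limit */
	pthread_mutex_unlock(&pool->lock);
	return ret;
}

void pool_put_pages(struct page_pool *pool, long delta)
{
	pthread_mutex_lock(&pool->lock);
	pool->used_pages -= delta;
	/* May free the pool if the last handle is already gone. */
	unlock_or_release_pool(pool);
}

The point mirrored from unlock_or_release_subpool() is that whichever of pool_put() or pool_put_pages() brings both counters to zero performs the free, so a page returned after the filesystem instance is gone still has valid accounting state to update.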
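
Similarly, for the CVE-2012-2373 fix: on 32-bit PAE a 64-bit pmd is read as two 32-bit loads, and the reported crash comes from reading the high half before a concurrent pmd_populate() and the low half after it. The sketch below shows the safe ordering used by the non-THP pmd_read_atomic(): read the low half first, and only fold in the high half (after a read barrier) when the low half is non-zero, so a torn value can never be mistaken for a valid entry. uint64_t, the volatile loads and atomic_thread_fence() are user-space stand-ins for pmdval_t, the compiler-generated loads and smp_rmb(); it assumes a little-endian layout as on x86 and illustrates the ordering argument only, not the kernel code.

#include <stdint.h>
#include <stdatomic.h>

uint64_t read_pmd_ordered(const uint64_t *pmdp)
{
	/* tmp[0] is the low 32 bits, tmp[1] the high 32 bits (little endian). */
	const volatile uint32_t *tmp = (const volatile uint32_t *)pmdp;
	uint64_t ret = tmp[0];			/* low half first */

	if (ret) {
		/*
		 * The low half is non-zero, so the populating 64-bit store
		 * has already completed; the high half read after the
		 * barrier belongs to that same entry.
		 */
		atomic_thread_fence(memory_order_acquire);
		ret |= (uint64_t)tmp[1] << 32;
	}
	/* ret == 0 means "none", which is always a safe answer here. */
	return ret;
}

Reversing the two loads (high half first, as in the disassembly quoted in the commit message) is exactly what allows a populate between them to produce a half-stale pmd value.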