Add a bunch of security patches
svn path=/dists/sid/linux-2.6/; revision=19050
This commit is contained in:
parent
d5d155747a
commit
303c65230f
|
@ -27,6 +27,10 @@ linux-2.6 (3.2.19-1) UNRELEASED; urgency=low
|
|||
* [x86] ata_piix: defer disks to the Hyper-V drivers by default
|
||||
* [x86] drm/i915:: Disable FBC on SandyBridge (Closes: #675022)
|
||||
* AppArmor: compatibility patch for v5 interface (Closes: #661151)
|
||||
* hugepages: fix use after free bug in "quota" handling (CVE-2012-2133)
|
||||
* [x86] mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race
|
||||
condition (CVE-2012-2373)
|
||||
* hugetlb: fix resv_map leak in error path (CVE-2012-2390)
|
||||
|
||||
-- Ben Hutchings <ben@decadent.org.uk> Sun, 27 May 2012 01:12:44 +0100
|
||||
|
||||
|
|
451
debian/patches/bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch
vendored
Normal file
451
debian/patches/bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch
vendored
Normal file
|
@ -0,0 +1,451 @@
|
|||
From: David Gibson <david@gibson.dropbear.id.au>
|
||||
Date: Wed, 21 Mar 2012 16:34:12 -0700
|
||||
Subject: [PATCH] hugepages: fix use after free bug in "quota" handling
|
||||
|
||||
commit 90481622d75715bfcb68501280a917dbfe516029 upstream.
|
||||
|
||||
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
|
||||
general quota handling code, and they don't much resemble its behaviour.
|
||||
Rather than being about maintaining limits on on-disk block usage by
|
||||
particular users, they are instead about maintaining limits on in-memory
|
||||
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
|
||||
associated with a particular hugetlbfs filesystem instance.
|
||||
|
||||
Worse, they work by having callbacks to the hugetlbfs filesystem code from
|
||||
the low-level page handling code, in particular from free_huge_page().
|
||||
This is a layering violation of itself, but more importantly, if the
|
||||
kernel does a get_user_pages() on hugepages (which can happen from KVM
|
||||
amongst others), then the free_huge_page() can be delayed until after the
|
||||
associated inode has already been freed. If an unmount occurs at the
|
||||
wrong time, even the hugetlbfs superblock where the "quota" limits are
|
||||
stored may have been freed.
|
||||
|
||||
Andrew Barry proposed a patch to fix this by having hugepages, instead of
|
||||
storing a pointer to their address_space and reaching the superblock from
|
||||
there, had the hugepages store pointers directly to the superblock,
|
||||
bumping the reference count as appropriate to avoid it being freed.
|
||||
Andrew Morton rejected that version, however, on the grounds that it made
|
||||
the existing layering violation worse.
|
||||
|
||||
This is a reworked version of Andrew's patch, which removes the extra, and
|
||||
some of the existing, layering violation. It works by introducing the
|
||||
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
|
||||
finite logical pool of hugepages to allocate from. hugetlbfs now creates
|
||||
a subpool for each filesystem instance with a page limit set, and a
|
||||
pointer to the subpool gets added to each allocated hugepage, instead of
|
||||
the address_space pointer used now. The subpool has its own lifetime and
|
||||
is only freed once all pages in it _and_ all other references to it (i.e.
|
||||
superblocks) are gone.
|
||||
|
||||
subpools are optional - a NULL subpool pointer is taken by the code to
|
||||
mean that no subpool limits are in effect.
|
||||
|
||||
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
|
||||
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
|
||||
http://marc.info/?l=linux-mm&m=126928970510627&w=1
|
||||
|
||||
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
|
||||
alloc_huge_page() - since it already takes the vma, it is not necessary.
|
||||
|
||||
Signed-off-by: Andrew Barry <abarry@cray.com>
|
||||
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
||||
Cc: Hugh Dickins <hughd@google.com>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Minchan Kim <minchan.kim@gmail.com>
|
||||
Cc: Hillf Danton <dhillf@gmail.com>
|
||||
Cc: Paul Mackerras <paulus@samba.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
[bwh: Backported to 3.2: adjust context to apply after commit
|
||||
c50ac050811d6485616a193eb0f37bfbd191cc89 'hugetlb: fix resv_map leak in
|
||||
error path' which should be in 3.2.20]
|
||||
---
|
||||
--- a/fs/hugetlbfs/inode.c
|
||||
+++ b/fs/hugetlbfs/inode.c
|
||||
@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
||||
spin_lock(&sbinfo->stat_lock);
|
||||
/* If no limits set, just report 0 for max/free/used
|
||||
* blocks, like simple_statfs() */
|
||||
- if (sbinfo->max_blocks >= 0) {
|
||||
- buf->f_blocks = sbinfo->max_blocks;
|
||||
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
|
||||
+ if (sbinfo->spool) {
|
||||
+ long free_pages;
|
||||
+
|
||||
+ spin_lock(&sbinfo->spool->lock);
|
||||
+ buf->f_blocks = sbinfo->spool->max_hpages;
|
||||
+ free_pages = sbinfo->spool->max_hpages
|
||||
+ - sbinfo->spool->used_hpages;
|
||||
+ buf->f_bavail = buf->f_bfree = free_pages;
|
||||
+ spin_unlock(&sbinfo->spool->lock);
|
||||
buf->f_files = sbinfo->max_inodes;
|
||||
buf->f_ffree = sbinfo->free_inodes;
|
||||
}
|
||||
@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
|
||||
|
||||
if (sbi) {
|
||||
sb->s_fs_info = NULL;
|
||||
+
|
||||
+ if (sbi->spool)
|
||||
+ hugepage_put_subpool(sbi->spool);
|
||||
+
|
||||
kfree(sbi);
|
||||
}
|
||||
}
|
||||
@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
sb->s_fs_info = sbinfo;
|
||||
sbinfo->hstate = config.hstate;
|
||||
spin_lock_init(&sbinfo->stat_lock);
|
||||
- sbinfo->max_blocks = config.nr_blocks;
|
||||
- sbinfo->free_blocks = config.nr_blocks;
|
||||
sbinfo->max_inodes = config.nr_inodes;
|
||||
sbinfo->free_inodes = config.nr_inodes;
|
||||
+ sbinfo->spool = NULL;
|
||||
+ if (config.nr_blocks != -1) {
|
||||
+ sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
|
||||
+ if (!sbinfo->spool)
|
||||
+ goto out_free;
|
||||
+ }
|
||||
sb->s_maxbytes = MAX_LFS_FILESIZE;
|
||||
sb->s_blocksize = huge_page_size(config.hstate);
|
||||
sb->s_blocksize_bits = huge_page_shift(config.hstate);
|
||||
@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
sb->s_root = root;
|
||||
return 0;
|
||||
out_free:
|
||||
+ if (sbinfo->spool)
|
||||
+ kfree(sbinfo->spool);
|
||||
kfree(sbinfo);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
-int hugetlb_get_quota(struct address_space *mapping, long delta)
|
||||
-{
|
||||
- int ret = 0;
|
||||
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
|
||||
-
|
||||
- if (sbinfo->free_blocks > -1) {
|
||||
- spin_lock(&sbinfo->stat_lock);
|
||||
- if (sbinfo->free_blocks - delta >= 0)
|
||||
- sbinfo->free_blocks -= delta;
|
||||
- else
|
||||
- ret = -ENOMEM;
|
||||
- spin_unlock(&sbinfo->stat_lock);
|
||||
- }
|
||||
-
|
||||
- return ret;
|
||||
-}
|
||||
-
|
||||
-void hugetlb_put_quota(struct address_space *mapping, long delta)
|
||||
-{
|
||||
- struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
|
||||
-
|
||||
- if (sbinfo->free_blocks > -1) {
|
||||
- spin_lock(&sbinfo->stat_lock);
|
||||
- sbinfo->free_blocks += delta;
|
||||
- spin_unlock(&sbinfo->stat_lock);
|
||||
- }
|
||||
-}
|
||||
-
|
||||
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
|
||||
int flags, const char *dev_name, void *data)
|
||||
{
|
||||
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
|
||||
index 7adc492..cf01817 100644
|
||||
--- a/include/linux/hugetlb.h
|
||||
+++ b/include/linux/hugetlb.h
|
||||
@@ -14,6 +14,15 @@ struct user_struct;
|
||||
#include <linux/shm.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
+struct hugepage_subpool {
|
||||
+ spinlock_t lock;
|
||||
+ long count;
|
||||
+ long max_hpages, used_hpages;
|
||||
+};
|
||||
+
|
||||
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
|
||||
+void hugepage_put_subpool(struct hugepage_subpool *spool);
|
||||
+
|
||||
int PageHuge(struct page *page);
|
||||
|
||||
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
|
||||
@@ -129,12 +138,11 @@ enum {
|
||||
};
|
||||
|
||||
struct hugetlbfs_sb_info {
|
||||
- long max_blocks; /* blocks allowed */
|
||||
- long free_blocks; /* blocks free */
|
||||
long max_inodes; /* inodes allowed */
|
||||
long free_inodes; /* inodes free */
|
||||
spinlock_t stat_lock;
|
||||
struct hstate *hstate;
|
||||
+ struct hugepage_subpool *spool;
|
||||
};
|
||||
|
||||
|
||||
@@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations;
|
||||
extern const struct vm_operations_struct hugetlb_vm_ops;
|
||||
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
|
||||
struct user_struct **user, int creat_flags);
|
||||
-int hugetlb_get_quota(struct address_space *mapping, long delta);
|
||||
-void hugetlb_put_quota(struct address_space *mapping, long delta);
|
||||
|
||||
static inline int is_file_hugepages(struct file *file)
|
||||
{
|
||||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
|
||||
index b1c3148..afa057a 100644
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
|
||||
*/
|
||||
static DEFINE_SPINLOCK(hugetlb_lock);
|
||||
|
||||
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
|
||||
+{
|
||||
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
|
||||
+
|
||||
+ spin_unlock(&spool->lock);
|
||||
+
|
||||
+ /* If no pages are used, and no other handles to the subpool
|
||||
+ * remain, free the subpool the subpool remain */
|
||||
+ if (free)
|
||||
+ kfree(spool);
|
||||
+}
|
||||
+
|
||||
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
|
||||
+{
|
||||
+ struct hugepage_subpool *spool;
|
||||
+
|
||||
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
|
||||
+ if (!spool)
|
||||
+ return NULL;
|
||||
+
|
||||
+ spin_lock_init(&spool->lock);
|
||||
+ spool->count = 1;
|
||||
+ spool->max_hpages = nr_blocks;
|
||||
+ spool->used_hpages = 0;
|
||||
+
|
||||
+ return spool;
|
||||
+}
|
||||
+
|
||||
+void hugepage_put_subpool(struct hugepage_subpool *spool)
|
||||
+{
|
||||
+ spin_lock(&spool->lock);
|
||||
+ BUG_ON(!spool->count);
|
||||
+ spool->count--;
|
||||
+ unlock_or_release_subpool(spool);
|
||||
+}
|
||||
+
|
||||
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
|
||||
+ long delta)
|
||||
+{
|
||||
+ int ret = 0;
|
||||
+
|
||||
+ if (!spool)
|
||||
+ return 0;
|
||||
+
|
||||
+ spin_lock(&spool->lock);
|
||||
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
|
||||
+ spool->used_hpages += delta;
|
||||
+ } else {
|
||||
+ ret = -ENOMEM;
|
||||
+ }
|
||||
+ spin_unlock(&spool->lock);
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
|
||||
+ long delta)
|
||||
+{
|
||||
+ if (!spool)
|
||||
+ return;
|
||||
+
|
||||
+ spin_lock(&spool->lock);
|
||||
+ spool->used_hpages -= delta;
|
||||
+ /* If hugetlbfs_put_super couldn't free spool due to
|
||||
+ * an outstanding quota reference, free it now. */
|
||||
+ unlock_or_release_subpool(spool);
|
||||
+}
|
||||
+
|
||||
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
|
||||
+{
|
||||
+ return HUGETLBFS_SB(inode->i_sb)->spool;
|
||||
+}
|
||||
+
|
||||
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
|
||||
+{
|
||||
+ return subpool_inode(vma->vm_file->f_dentry->d_inode);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Region tracking -- allows tracking of reservations and instantiated pages
|
||||
* across the pages in a mapping.
|
||||
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page)
|
||||
*/
|
||||
struct hstate *h = page_hstate(page);
|
||||
int nid = page_to_nid(page);
|
||||
- struct address_space *mapping;
|
||||
+ struct hugepage_subpool *spool =
|
||||
+ (struct hugepage_subpool *)page_private(page);
|
||||
|
||||
- mapping = (struct address_space *) page_private(page);
|
||||
set_page_private(page, 0);
|
||||
page->mapping = NULL;
|
||||
BUG_ON(page_count(page));
|
||||
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page)
|
||||
enqueue_huge_page(h, page);
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
- if (mapping)
|
||||
- hugetlb_put_quota(mapping, 1);
|
||||
+ hugepage_subpool_put_pages(spool, 1);
|
||||
}
|
||||
|
||||
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
|
||||
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
|
||||
/*
|
||||
* Determine if the huge page at addr within the vma has an associated
|
||||
* reservation. Where it does not we will need to logically increase
|
||||
- * reservation and actually increase quota before an allocation can occur.
|
||||
- * Where any new reservation would be required the reservation change is
|
||||
- * prepared, but not committed. Once the page has been quota'd allocated
|
||||
- * an instantiated the change should be committed via vma_commit_reservation.
|
||||
- * No action is required on failure.
|
||||
+ * reservation and actually increase subpool usage before an allocation
|
||||
+ * can occur. Where any new reservation would be required the
|
||||
+ * reservation change is prepared, but not committed. Once the page
|
||||
+ * has been allocated from the subpool and instantiated the change should
|
||||
+ * be committed via vma_commit_reservation. No action is required on
|
||||
+ * failure.
|
||||
*/
|
||||
static long vma_needs_reservation(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
|
||||
static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
unsigned long addr, int avoid_reserve)
|
||||
{
|
||||
+ struct hugepage_subpool *spool = subpool_vma(vma);
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct page *page;
|
||||
- struct address_space *mapping = vma->vm_file->f_mapping;
|
||||
- struct inode *inode = mapping->host;
|
||||
long chg;
|
||||
|
||||
/*
|
||||
- * Processes that did not create the mapping will have no reserves and
|
||||
- * will not have accounted against quota. Check that the quota can be
|
||||
- * made before satisfying the allocation
|
||||
- * MAP_NORESERVE mappings may also need pages and quota allocated
|
||||
- * if no reserve mapping overlaps.
|
||||
+ * Processes that did not create the mapping will have no
|
||||
+ * reserves and will not have accounted against subpool
|
||||
+ * limit. Check that the subpool limit can be made before
|
||||
+ * satisfying the allocation MAP_NORESERVE mappings may also
|
||||
+ * need pages and subpool limit allocated allocated if no reserve
|
||||
+ * mapping overlaps.
|
||||
*/
|
||||
chg = vma_needs_reservation(h, vma, addr);
|
||||
if (chg < 0)
|
||||
return ERR_PTR(-VM_FAULT_OOM);
|
||||
if (chg)
|
||||
- if (hugetlb_get_quota(inode->i_mapping, chg))
|
||||
+ if (hugepage_subpool_get_pages(spool, chg))
|
||||
return ERR_PTR(-VM_FAULT_SIGBUS);
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||
if (!page) {
|
||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||
if (!page) {
|
||||
- hugetlb_put_quota(inode->i_mapping, chg);
|
||||
+ hugepage_subpool_put_pages(spool, chg);
|
||||
return ERR_PTR(-VM_FAULT_SIGBUS);
|
||||
}
|
||||
}
|
||||
|
||||
- set_page_private(page, (unsigned long) mapping);
|
||||
+ set_page_private(page, (unsigned long)spool);
|
||||
|
||||
vma_commit_reservation(h, vma, addr);
|
||||
|
||||
@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
|
||||
{
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct resv_map *reservations = vma_resv_map(vma);
|
||||
+ struct hugepage_subpool *spool = subpool_vma(vma);
|
||||
unsigned long reserve;
|
||||
unsigned long start;
|
||||
unsigned long end;
|
||||
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
|
||||
|
||||
if (reserve) {
|
||||
hugetlb_acct_memory(h, -reserve);
|
||||
- hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
|
||||
+ hugepage_subpool_put_pages(spool, reserve);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
address = address & huge_page_mask(h);
|
||||
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
|
||||
+ (vma->vm_pgoff >> PAGE_SHIFT);
|
||||
- mapping = (struct address_space *)page_private(page);
|
||||
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
|
||||
|
||||
/*
|
||||
* Take the mapping lock for the duration of the table walk. As
|
||||
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
|
||||
{
|
||||
long ret, chg;
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
+ struct hugepage_subpool *spool = subpool_inode(inode);
|
||||
|
||||
/*
|
||||
* Only apply hugepage reservation if asked. At fault time, an
|
||||
* attempt will be made for VM_NORESERVE to allocate a page
|
||||
- * and filesystem quota without using reserves
|
||||
+ * without using reserves
|
||||
*/
|
||||
if (vm_flags & VM_NORESERVE)
|
||||
return 0;
|
||||
@@ -2915,19 +2995,19 @@ int hugetlb_reserve_pages(struct inode *inode,
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
- /* There must be enough filesystem quota for the mapping */
|
||||
- if (hugetlb_get_quota(inode->i_mapping, chg)) {
|
||||
+ /* There must be enough pages in the subpool for the mapping */
|
||||
+ if (hugepage_subpool_get_pages(spool, chg)) {
|
||||
ret = -ENOSPC;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check enough hugepages are available for the reservation.
|
||||
- * Hand back the quota if there are not
|
||||
+ * Hand the pages back to the subpool if there are not
|
||||
*/
|
||||
ret = hugetlb_acct_memory(h, chg);
|
||||
if (ret < 0) {
|
||||
- hugetlb_put_quota(inode->i_mapping, chg);
|
||||
+ hugepage_subpool_put_pages(spool, chg);
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||
{
|
||||
struct hstate *h = hstate_inode(inode);
|
||||
long chg = region_truncate(&inode->i_mapping->private_list, offset);
|
||||
+ struct hugepage_subpool *spool = subpool_inode(inode);
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
|
||||
spin_unlock(&inode->i_lock);
|
||||
|
||||
- hugetlb_put_quota(inode->i_mapping, (chg - freed));
|
||||
+ hugepage_subpool_put_pages(spool, (chg - freed));
|
||||
hugetlb_acct_memory(h, -(chg - freed));
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
From: Dave Hansen <dave@linux.vnet.ibm.com>
|
||||
Date: Fri, 18 May 2012 11:46:30 -0700
|
||||
Subject: hugetlb: fix resv_map leak in error path
|
||||
|
||||
commit c50ac050811d6485616a193eb0f37bfbd191cc89 upstream.
|
||||
|
||||
When called for anonymous (non-shared) mappings, hugetlb_reserve_pages()
|
||||
does a resv_map_alloc(). It depends on code in hugetlbfs's
|
||||
vm_ops->close() to release that allocation.
|
||||
|
||||
However, in the mmap() failure path, we do a plain unmap_region() without
|
||||
the remove_vma() which actually calls vm_ops->close().
|
||||
|
||||
This is a decent fix. This leak could get reintroduced if new code (say,
|
||||
after hugetlb_reserve_pages() in hugetlbfs_file_mmap()) decides to return
|
||||
an error. But, I think it would have to unroll the reservation anyway.
|
||||
|
||||
Christoph's test case:
|
||||
|
||||
http://marc.info/?l=linux-mm&m=133728900729735
|
||||
|
||||
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
|
||||
[Christoph Lameter: I have rediffed the patch against 2.6.32 and 3.2.0.]
|
||||
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
|
||||
---
|
||||
mm/hugetlb.c | 28 ++++++++++++++++++++++------
|
||||
1 file changed, 22 insertions(+), 6 deletions(-)
|
||||
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -2068,6 +2068,15 @@
|
||||
kref_get(&reservations->refs);
|
||||
}
|
||||
|
||||
+static void resv_map_put(struct vm_area_struct *vma)
|
||||
+{
|
||||
+ struct resv_map *reservations = vma_resv_map(vma);
|
||||
+
|
||||
+ if (!reservations)
|
||||
+ return;
|
||||
+ kref_put(&reservations->refs, resv_map_release);
|
||||
+}
|
||||
+
|
||||
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
|
||||
{
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
@@ -2083,7 +2092,7 @@
|
||||
reserve = (end - start) -
|
||||
region_count(&reservations->regions, start, end);
|
||||
|
||||
- kref_put(&reservations->refs, resv_map_release);
|
||||
+ resv_map_put(vma);
|
||||
|
||||
if (reserve) {
|
||||
hugetlb_acct_memory(h, -reserve);
|
||||
@@ -2884,12 +2893,16 @@
|
||||
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
|
||||
}
|
||||
|
||||
- if (chg < 0)
|
||||
- return chg;
|
||||
+ if (chg < 0) {
|
||||
+ ret = chg;
|
||||
+ goto out_err;
|
||||
+ }
|
||||
|
||||
/* There must be enough filesystem quota for the mapping */
|
||||
- if (hugetlb_get_quota(inode->i_mapping, chg))
|
||||
- return -ENOSPC;
|
||||
+ if (hugetlb_get_quota(inode->i_mapping, chg)) {
|
||||
+ ret = -ENOSPC;
|
||||
+ goto out_err;
|
||||
+ }
|
||||
|
||||
/*
|
||||
* Check enough hugepages are available for the reservation.
|
||||
@@ -2898,7 +2911,7 @@
|
||||
ret = hugetlb_acct_memory(h, chg);
|
||||
if (ret < 0) {
|
||||
hugetlb_put_quota(inode->i_mapping, chg);
|
||||
- return ret;
|
||||
+ goto out_err;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2915,6 +2928,9 @@
|
||||
if (!vma || vma->vm_flags & VM_MAYSHARE)
|
||||
region_add(&inode->i_mapping->private_list, from, to);
|
||||
return 0;
|
||||
+out_err:
|
||||
+ resv_map_put(vma);
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
|
@ -0,0 +1,66 @@
|
|||
From: Dave Hansen <dave@linux.vnet.ibm.com>
|
||||
Date: Wed, 30 May 2012 07:51:07 -0700
|
||||
Subject: mm: fix vma_resv_map() NULL pointer
|
||||
|
||||
commit 4523e1458566a0e8ecfaff90f380dd23acc44d27 upstream.
|
||||
|
||||
hugetlb_reserve_pages() can be used for either normal file-backed
|
||||
hugetlbfs mappings, or MAP_HUGETLB. In the MAP_HUGETLB, semi-anonymous
|
||||
mode, there is not a VMA around. The new call to resv_map_put() assumed
|
||||
that there was, and resulted in a NULL pointer dereference:
|
||||
|
||||
BUG: unable to handle kernel NULL pointer dereference at 0000000000000030
|
||||
IP: vma_resv_map+0x9/0x30
|
||||
PGD 141453067 PUD 1421e1067 PMD 0
|
||||
Oops: 0000 [#1] PREEMPT SMP
|
||||
...
|
||||
Pid: 14006, comm: trinity-child6 Not tainted 3.4.0+ #36
|
||||
RIP: vma_resv_map+0x9/0x30
|
||||
...
|
||||
Process trinity-child6 (pid: 14006, threadinfo ffff8801414e0000, task ffff8801414f26b0)
|
||||
Call Trace:
|
||||
resv_map_put+0xe/0x40
|
||||
hugetlb_reserve_pages+0xa6/0x1d0
|
||||
hugetlb_file_setup+0x102/0x2c0
|
||||
newseg+0x115/0x360
|
||||
ipcget+0x1ce/0x310
|
||||
sys_shmget+0x5a/0x60
|
||||
system_call_fastpath+0x16/0x1b
|
||||
|
||||
This was reported by Dave Jones, but was reproducible with the
|
||||
libhugetlbfs test cases, so shame on me for not running them in the
|
||||
first place.
|
||||
|
||||
With this, the oops is gone, and the output of libhugetlbfs's
|
||||
run_tests.py is identical to plain 3.4 again.
|
||||
|
||||
[ Marked for stable, since this was introduced by commit c50ac050811d
|
||||
("hugetlb: fix resv_map leak in error path") which was also marked for
|
||||
stable ]
|
||||
|
||||
Reported-by: Dave Jones <davej@redhat.com>
|
||||
Cc: Mel Gorman <mel@csn.ul.ie>
|
||||
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
|
||||
Cc: Christoph Lameter <cl@linux.com>
|
||||
Cc: Andrea Arcangeli <aarcange@redhat.com>
|
||||
Cc: Andrew Morton <akpm@linux-foundation.org>
|
||||
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
|
||||
---
|
||||
mm/hugetlb.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
|
||||
index 285a81e..e198831 100644
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -3036,7 +3036,8 @@ int hugetlb_reserve_pages(struct inode *inode,
|
||||
region_add(&inode->i_mapping->private_list, from, to);
|
||||
return 0;
|
||||
out_err:
|
||||
- resv_map_put(vma);
|
||||
+ if (vma)
|
||||
+ resv_map_put(vma);
|
||||
return ret;
|
||||
}
|
||||
|
214
debian/patches/bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch
vendored
Normal file
214
debian/patches/bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch
vendored
Normal file
|
@ -0,0 +1,214 @@
|
|||
From: Andrea Arcangeli <aarcange@redhat.com>
|
||||
Date: Tue, 29 May 2012 15:06:49 -0700
|
||||
Subject: mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race
|
||||
condition
|
||||
|
||||
commit 26c191788f18129af0eb32a358cdaea0c7479626 upstream.
|
||||
|
||||
When holding the mmap_sem for reading, pmd_offset_map_lock should only
|
||||
run on a pmd_t that has been read atomically from the pmdp pointer,
|
||||
otherwise we may read only half of it leading to this crash.
|
||||
|
||||
PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic"
|
||||
#0 [f06a9dd8] crash_kexec at c049b5ec
|
||||
#1 [f06a9e2c] oops_end at c083d1c2
|
||||
#2 [f06a9e40] no_context at c0433ded
|
||||
#3 [f06a9e64] bad_area_nosemaphore at c043401a
|
||||
#4 [f06a9e6c] __do_page_fault at c0434493
|
||||
#5 [f06a9eec] do_page_fault at c083eb45
|
||||
#6 [f06a9f04] error_code (via page_fault) at c083c5d5
|
||||
EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP:
|
||||
00000000
|
||||
DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0
|
||||
CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246
|
||||
#7 [f06a9f38] _spin_lock at c083bc14
|
||||
#8 [f06a9f44] sys_mincore at c0507b7d
|
||||
#9 [f06a9fb0] system_call at c083becd
|
||||
start len
|
||||
EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f
|
||||
DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00
|
||||
SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033
|
||||
CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286
|
||||
|
||||
This should be a longstanding bug affecting x86 32bit PAE without THP.
|
||||
Only archs with 64bit large pmd_t and 32bit unsigned long should be
|
||||
affected.
|
||||
|
||||
With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad()
|
||||
would partly hide the bug when the pmd transition from none to stable,
|
||||
by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is
|
||||
enabled a new set of problem arises by the fact could then transition
|
||||
freely in any of the none, pmd_trans_huge or pmd_trans_stable states.
|
||||
So making the barrier in pmd_none_or_trans_huge_or_clear_bad()
|
||||
unconditional isn't good idea and it would be a flakey solution.
|
||||
|
||||
This should be fully fixed by introducing a pmd_read_atomic that reads
|
||||
the pmd in order with THP disabled, or by reading the pmd atomically
|
||||
with cmpxchg8b with THP enabled.
|
||||
|
||||
Luckily this new race condition only triggers in the places that must
|
||||
already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix
|
||||
is localized there but this bug is not related to THP.
|
||||
|
||||
NOTE: this can trigger on x86 32bit systems with PAE enabled with more
|
||||
than 4G of ram, otherwise the high part of the pmd will never risk to be
|
||||
truncated because it would be zero at all times, in turn so hiding the
|
||||
SMP race.
|
||||
|
||||
This bug was discovered and fully debugged by Ulrich, quote:
|
||||
|
||||
----
|
||||
[..]
|
||||
pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and
|
||||
eax.
|
||||
|
||||
496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t
|
||||
*pmd)
|
||||
497 {
|
||||
498 /* depend on compiler for an atomic pmd read */
|
||||
499 pmd_t pmdval = *pmd;
|
||||
|
||||
// edi = pmd pointer
|
||||
0xc0507a74 <sys_mincore+548>: mov 0x8(%esp),%edi
|
||||
...
|
||||
// edx = PTE page table high address
|
||||
0xc0507a84 <sys_mincore+564>: mov 0x4(%edi),%edx
|
||||
...
|
||||
// eax = PTE page table low address
|
||||
0xc0507a8e <sys_mincore+574>: mov (%edi),%eax
|
||||
|
||||
[..]
|
||||
|
||||
Please note that the PMD is not read atomically. These are two "mov"
|
||||
instructions where the high order bits of the PMD entry are fetched
|
||||
first. Hence, the above machine code is prone to the following race.
|
||||
|
||||
- The PMD entry {high|low} is 0x0000000000000000.
|
||||
The "mov" at 0xc0507a84 loads 0x00000000 into edx.
|
||||
|
||||
- A page fault (on another CPU) sneaks in between the two "mov"
|
||||
instructions and instantiates the PMD.
|
||||
|
||||
- The PMD entry {high|low} is now 0x00000003fda38067.
|
||||
The "mov" at 0xc0507a8e loads 0xfda38067 into eax.
|
||||
----
|
||||
|
||||
Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
|
||||
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Hugh Dickins <hughd@google.com>
|
||||
Cc: Larry Woodman <lwoodman@redhat.com>
|
||||
Cc: Petr Matousek <pmatouse@redhat.com>
|
||||
Cc: Rik van Riel <riel@redhat.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
|
||||
---
|
||||
arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++
|
||||
include/asm-generic/pgtable.h | 22 +++++++++++++--
|
||||
2 files changed, 70 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
|
||||
index effff47..43876f1 100644
|
||||
--- a/arch/x86/include/asm/pgtable-3level.h
|
||||
+++ b/arch/x86/include/asm/pgtable-3level.h
|
||||
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
|
||||
ptep->pte_low = pte.pte_low;
|
||||
}
|
||||
|
||||
+#define pmd_read_atomic pmd_read_atomic
|
||||
+/*
|
||||
+ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
|
||||
+ * a "*pmdp" dereference done by gcc. Problem is, in certain places
|
||||
+ * where pte_offset_map_lock is called, concurrent page faults are
|
||||
+ * allowed, if the mmap_sem is hold for reading. An example is mincore
|
||||
+ * vs page faults vs MADV_DONTNEED. On the page fault side
|
||||
+ * pmd_populate rightfully does a set_64bit, but if we're reading the
|
||||
+ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
|
||||
+ * because gcc will not read the 64bit of the pmd atomically. To fix
|
||||
+ * this all places running pmd_offset_map_lock() while holding the
|
||||
+ * mmap_sem in read mode, shall read the pmdp pointer using this
|
||||
+ * function to know if the pmd is null nor not, and in turn to know if
|
||||
+ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
|
||||
+ * operations.
|
||||
+ *
|
||||
+ * Without THP if the mmap_sem is hold for reading, the
|
||||
+ * pmd can only transition from null to not null while pmd_read_atomic runs.
|
||||
+ * So there's no need of literally reading it atomically.
|
||||
+ *
|
||||
+ * With THP if the mmap_sem is hold for reading, the pmd can become
|
||||
+ * THP or null or point to a pte (and in turn become "stable") at any
|
||||
+ * time under pmd_read_atomic, so it's mandatory to read it atomically
|
||||
+ * with cmpxchg8b.
|
||||
+ */
|
||||
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
|
||||
+{
|
||||
+ pmdval_t ret;
|
||||
+ u32 *tmp = (u32 *)pmdp;
|
||||
+
|
||||
+ ret = (pmdval_t) (*tmp);
|
||||
+ if (ret) {
|
||||
+ /*
|
||||
+ * If the low part is null, we must not read the high part
|
||||
+ * or we can end up with a partial pmd.
|
||||
+ */
|
||||
+ smp_rmb();
|
||||
+ ret |= ((pmdval_t)*(tmp + 1)) << 32;
|
||||
+ }
|
||||
+
|
||||
+ return (pmd_t) { ret };
|
||||
+}
|
||||
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
|
||||
+{
|
||||
+ return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
|
||||
+}
|
||||
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
+
|
||||
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
|
||||
{
|
||||
set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
|
||||
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
|
||||
index e2768f1..6f2b45a 100644
|
||||
--- a/include/asm-generic/pgtable.h
|
||||
+++ b/include/asm-generic/pgtable.h
|
||||
@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd)
|
||||
#endif /* __HAVE_ARCH_PMD_WRITE */
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
+#ifndef pmd_read_atomic
|
||||
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
|
||||
+{
|
||||
+ /*
|
||||
+ * Depend on compiler for an atomic pmd read. NOTE: this is
|
||||
+ * only going to work, if the pmdval_t isn't larger than
|
||||
+ * an unsigned long.
|
||||
+ */
|
||||
+ return *pmdp;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* This function is meant to be used by sites walking pagetables with
|
||||
* the mmap_sem hold in read mode to protect against MADV_DONTNEED and
|
||||
@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd)
|
||||
* undefined so behaving like if the pmd was none is safe (because it
|
||||
* can return none anyway). The compiler level barrier() is critically
|
||||
* important to compute the two checks atomically on the same pmdval.
|
||||
+ *
|
||||
+ * For 32bit kernels with a 64bit large pmd_t this automatically takes
|
||||
+ * care of reading the pmd atomically to avoid SMP race conditions
|
||||
+ * against pmd_populate() when the mmap_sem is hold for reading by the
|
||||
+ * caller (a special atomic read not done by "gcc" as in the generic
|
||||
+ * version above, is also needed when THP is disabled because the page
|
||||
+ * fault can populate the pmd from under us).
|
||||
*/
|
||||
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
|
||||
{
|
||||
- /* depend on compiler for an atomic pmd read */
|
||||
- pmd_t pmdval = *pmd;
|
||||
+ pmd_t pmdval = pmd_read_atomic(pmd);
|
||||
/*
|
||||
* The barrier will stabilize the pmdval in a register or on
|
||||
* the stack so that it will stop changing under the code.
|
|
@ -302,3 +302,8 @@
|
|||
|
||||
# AppArmor userland compatibility. This had better be gone in wheezy+1!
|
||||
+ features/all/AppArmor-compatibility-patch-for-v5-interface.patch
|
||||
|
||||
+ bugfix/x86/mm-pmd_read_atomic-fix-32bit-pae-pmd-walk-vs-pmd_populate-smp-race.patch
|
||||
+ bugfix/all/hugetlb-fix-resv_map-leak-in-error-path.patch
|
||||
+ bugfix/all/mm-fix-vma_resv_map-null-pointer.patch
|
||||
+ bugfix/all/hugepages-fix-use-after-free-bug-in-quota-handling.patch
|
||||
|
|
Loading…
Reference in New Issue