From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 21 Mar 2012 16:34:12 -0700
Subject: [PATCH] hugepages: fix use after free bug in "quota" handling

commit 90481622d75715bfcb68501280a917dbfe516029 upstream.

hugetlbfs_{get,put}_quota() are badly named. They don't interact with
the general quota handling code, and they don't much resemble its
behaviour. Rather than being about maintaining limits on on-disk block
usage by particular users, they are instead about maintaining limits on
in-memory page usage (including anonymous MAP_PRIVATE copied-on-write
pages) associated with a particular hugetlbfs filesystem instance.

Worse, they work by having callbacks to the hugetlbfs filesystem code
from the low-level page handling code, in particular from
free_huge_page(). This is a layering violation in itself, but more
importantly, if the kernel does a get_user_pages() on hugepages (which
can happen from KVM amongst others), then free_huge_page() can be
delayed until after the associated inode has already been freed. If an
unmount occurs at the wrong time, even the hugetlbfs superblock where
the "quota" limits are stored may have been freed.
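
To make the window concrete, here is an illustrative interleaving; the
function names are the real ones, but the timeline is a sketch for this
note rather than a captured trace:

	task A (e.g. KVM)                   task B
	-----------------                   ------
	get_user_pages() pins a hugepage
	                                    umount of the hugetlbfs
	                                    instance frees the inode
	                                    and the superblock
	put_page() drops the last reference
	  -> free_huge_page()
	       -> hugetlb_put_quota(mapping, 1)
	          "mapping" now points into
	          freed memory: use after free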
|
|
Andrew Barry proposed a patch to fix this by having hugepages store
pointers directly to the superblock, instead of storing a pointer to
their address_space and reaching the superblock from there, bumping the
reference count as appropriate to avoid it being freed. Andrew Morton
rejected that version, however, on the grounds that it made the
existing layering violation worse.

This is a reworked version of Andrew's patch, which removes the extra,
and some of the existing, layering violation. It works by introducing
the concept of a hugepage "subpool" at the lower hugepage mm layer -
that is, a finite logical pool of hugepages to allocate from.
hugetlbfs now creates a subpool for each filesystem instance with a
page limit set, and a pointer to the subpool gets added to each
allocated hugepage, instead of the address_space pointer used now. The
subpool has its own lifetime and is only freed once all pages in it
_and_ all other references to it (i.e. superblocks) are gone.

Subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
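
For orientation while reading the diff, the new interface condenses to
this (taken from the changes below, with annotations added here):

	struct hugepage_subpool {
		spinlock_t lock;
		long count;		/* references, e.g. the superblock */
		long max_hpages, used_hpages;	/* limit and current usage */
	};

	struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
	void hugepage_put_subpool(struct hugepage_subpool *spool);

hugepage_new_subpool() returns a subpool with count = 1 (the reference
the superblock holds); hugepage_subpool_get_pages()/_put_pages() adjust
used_hpages against max_hpages, and the subpool is only kfree()d once
count and used_hpages are both zero.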
|
|
Previous discussion of this bug can be found in "Fix refcounting in
hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1

v2: Fixed a bug spotted by Hillf Danton, and removed the extra
parameter to alloc_huge_page() - since it already takes the vma, it is
not necessary.

Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2: adjust context to apply after commit
 c50ac050811d6485616a193eb0f37bfbd191cc89 ('hugetlb: fix resv_map leak
 in error path'), which should be in 3.2.20]
---
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		spin_lock(&sbinfo->stat_lock);
 		/* If no limits set, just report 0 for max/free/used
 		 * blocks, like simple_statfs() */
-		if (sbinfo->max_blocks >= 0) {
-			buf->f_blocks = sbinfo->max_blocks;
-			buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+		if (sbinfo->spool) {
+			long free_pages;
+
+			spin_lock(&sbinfo->spool->lock);
+			buf->f_blocks = sbinfo->spool->max_hpages;
+			free_pages = sbinfo->spool->max_hpages
+				- sbinfo->spool->used_hpages;
+			buf->f_bavail = buf->f_bfree = free_pages;
+			spin_unlock(&sbinfo->spool->lock);
 			buf->f_files = sbinfo->max_inodes;
 			buf->f_ffree = sbinfo->free_inodes;
 		}
@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
 
 	if (sbi) {
 		sb->s_fs_info = NULL;
+
+		if (sbi->spool)
+			hugepage_put_subpool(sbi->spool);
+
 		kfree(sbi);
 	}
 }
@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbinfo;
 	sbinfo->hstate = config.hstate;
 	spin_lock_init(&sbinfo->stat_lock);
-	sbinfo->max_blocks = config.nr_blocks;
-	sbinfo->free_blocks = config.nr_blocks;
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
+	sbinfo->spool = NULL;
+	if (config.nr_blocks != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+		if (!sbinfo->spool)
+			goto out_free;
+	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = huge_page_size(config.hstate);
 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root = root;
 	return 0;
 out_free:
+	if (sbinfo->spool)
+		kfree(sbinfo->spool);
 	kfree(sbinfo);
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
-	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks - delta >= 0)
-			sbinfo->free_blocks -= delta;
-		else
-			ret = -ENOMEM;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
-	return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_blocks += delta;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-}
-
 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
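
A usage-level illustration of the statfs change above (my own sketch,
not part of the patch; it assumes 2 MiB huge pages, an existing
/mnt/huge directory and root privileges):

	#include <stdio.h>
	#include <sys/mount.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs sf;

		/* size=32M with 2 MiB huge pages gives nr_blocks = 16,
		 * so hugetlbfs_fill_super() above ends up calling
		 * hugepage_new_subpool(16). */
		if (mount("none", "/mnt/huge", "hugetlbfs", 0, "size=32M"))
			return 1;

		/* hugetlbfs_statfs() now reports spool->max_hpages and
		 * max_hpages - used_hpages, in huge-page-sized blocks. */
		if (statfs("/mnt/huge", &sf))
			return 1;

		printf("max %ld, free %ld huge pages\n",
		       (long)sf.f_blocks, (long)sf.f_bfree);
		return 0;
	}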
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7adc492..cf01817 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,15 @@ struct user_struct;
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
+struct hugepage_subpool {
+	spinlock_t lock;
+	long count;
+	long max_hpages, used_hpages;
+};
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+void hugepage_put_subpool(struct hugepage_subpool *spool);
+
 int PageHuge(struct page *page);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -129,12 +138,11 @@ enum {
 };
 
 struct hugetlbfs_sb_info {
-	long	max_blocks;   /* blocks allowed */
-	long	free_blocks;  /* blocks free */
 	long	max_inodes;   /* inodes allowed */
 	long	free_inodes;  /* inodes free */
 	spinlock_t	stat_lock;
 	struct hstate *hstate;
+	struct hugepage_subpool *spool;
 };
 
 
@@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1c3148..afa057a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
 */
static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+	bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+	spin_unlock(&spool->lock);
+
+	/* If no pages are used, and no other handles to the subpool
+	 * remain, free the subpool */
+	if (free)
+		kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+	struct hugepage_subpool *spool;
+
+	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	if (!spool)
+		return NULL;
+
+	spin_lock_init(&spool->lock);
+	spool->count = 1;
+	spool->max_hpages = nr_blocks;
+	spool->used_hpages = 0;
+
+	return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+	spin_lock(&spool->lock);
+	BUG_ON(!spool->count);
+	spool->count--;
+	unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+				      long delta)
+{
+	int ret = 0;
+
+	if (!spool)
+		return 0;
+
+	spin_lock(&spool->lock);
+	if ((spool->used_hpages + delta) <= spool->max_hpages) {
+		spool->used_hpages += delta;
+	} else {
+		ret = -ENOMEM;
+	}
+	spin_unlock(&spool->lock);
+
+	return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+				       long delta)
+{
+	if (!spool)
+		return;
+
+	spin_lock(&spool->lock);
+	spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free spool due to
+	 * an outstanding quota reference, free it now. */
+	unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+	return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugepage_subpool *spool =
+		(struct hugepage_subpool *)page_private(page);
 
-	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed.  Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur.  Where any new reservation would be required the
+ * reservation change is prepared, but not committed.  Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation.  No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	struct inode *inode = mapping->host;
 	long chg;
 
 	/*
-	 * Processes that did not create the mapping will have no reserves and
-	 * will not have accounted against quota. Check that the quota can be
-	 * made before satisfying the allocation
-	 * MAP_NORESERVE mappings may also need pages and quota allocated
-	 * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against subpool
+	 * limit.  Check that the subpool limit can be made before
+	 * satisfying the allocation.  MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated if no reserve
+	 * mapping overlaps.
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugepage_subpool_get_pages(spool, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugepage_subpool_put_pages(spool, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugepage_subpool_put_pages(spool, reserve);
 		}
 	}
 }
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
-	mapping = (struct address_space *)page_private(page);
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
 	 * Take the mapping lock for the duration of the table walk.  As
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
-	 * and filesystem quota without using reserves
+	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
 		return 0;
@@ -2915,19 +2995,19 @@ int hugetlb_reserve_pages(struct inode *inode,
 		goto out_err;
 	}
 
-	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg)) {
+	/* There must be enough pages in the subpool for the mapping */
+	if (hugepage_subpool_get_pages(spool, chg)) {
 		ret = -ENOSPC;
 		goto out_err;
 	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
-	 * Hand back the quota if there are not
+	 * Hand the pages back to the subpool if there are not
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugepage_subpool_put_pages(spool, chg);
 		goto out_err;
 	}
 
@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
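
Putting the pieces together, the lifetime rule that closes the original
race can be walked through like this (kernel-context sketch, not
standalone code; the numbers are made up):

	struct hugepage_subpool *spool = hugepage_new_subpool(16);
						/* count = 1, used_hpages = 0 */
	hugepage_subpool_get_pages(spool, 2);	/* fault path: used_hpages = 2 */
	hugepage_put_subpool(spool);		/* umount: count = 0, but
						 * used_hpages != 0, so the
						 * subpool outlives the
						 * superblock */
	hugepage_subpool_put_pages(spool, 2);	/* last free_huge_page(): both
						 * counts are zero, so
						 * unlock_or_release_subpool()
						 * kfree()s it */

This is exactly the window that used to be a use after free: a pinned
hugepage that outlives its filesystem instance now keeps a small
accounting object alive instead of dereferencing a freed superblock.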