2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Generic hugetlb support.
|
|
|
|
* (C) William Irwin, April 2004
|
|
|
|
*/
|
|
|
|
#include <linux/gfp.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/sysctl.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/nodemask.h>
|
2005-06-22 04:14:44 +04:00
|
|
|
#include <linux/pagemap.h>
|
2006-01-06 11:10:46 +03:00
|
|
|
#include <linux/mempolicy.h>
|
2006-01-08 12:00:57 +03:00
|
|
|
#include <linux/cpuset.h>
|
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache. When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage. However, since the number of hugepages in the system
is strictly limited, and it's usual to want to use all of them, this can
still lead to spurious allocation failures.
For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage. One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.
The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache. It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us. Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.
This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.
Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather than processes getting OOM killed by the VM. The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there. Another patch to replace those tests with
something saner for this reason as well as others coming...
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:53 +03:00
|
|
|
#include <linux/mutex.h>
|
2006-01-06 11:10:46 +03:00
|
|
|
|
2005-06-22 04:14:44 +04:00
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
|
|
|
|
#include <linux/hugetlb.h>
|
2006-03-22 11:08:40 +03:00
|
|
|
#include "internal.h"
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
|
2006-06-23 13:03:15 +04:00
|
|
|
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
|
2005-04-17 02:20:36 +04:00
|
|
|
unsigned long max_huge_pages;
|
|
|
|
static struct list_head hugepage_freelists[MAX_NUMNODES];
|
|
|
|
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
|
|
|
|
static unsigned int free_huge_pages_node[MAX_NUMNODES];
|
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache. When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage. However, since the number of hugepages in the system
is strictly limited, and it's usual to want to use all of them, this can
still lead to spurious allocation failures.
For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage. One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.
The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache. It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us. Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.
This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.
Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather than processes getting OOM killed by the VM. The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there. Another patch to replace those tests with
something saner for this reason as well as others coming...
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:53 +03:00
|
|
|
/*
 * Serializes updates to the hugepage free lists (hugepage_freelists) and
 * to the pool counters nr_huge_pages and free_huge_pages.
 */
static DEFINE_SPINLOCK(hugetlb_lock);
|
2005-11-22 08:32:28 +03:00
|
|
|
|
2006-03-22 11:08:51 +03:00
|
|
|
static void clear_huge_page(struct page *page, unsigned long addr)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
might_sleep();
|
|
|
|
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
|
|
|
|
cond_resched();
|
|
|
|
clear_user_highpage(page + i, addr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void copy_huge_page(struct page *dst, struct page *src,
|
2006-12-12 20:14:55 +03:00
|
|
|
unsigned long addr, struct vm_area_struct *vma)
|
2006-03-22 11:08:51 +03:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
might_sleep();
|
|
|
|
for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
|
|
|
|
cond_resched();
|
2006-12-12 20:14:55 +03:00
|
|
|
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
|
2006-03-22 11:08:51 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
static void enqueue_huge_page(struct page *page)
|
|
|
|
{
|
|
|
|
int nid = page_to_nid(page);
|
|
|
|
list_add(&page->lru, &hugepage_freelists[nid]);
|
|
|
|
free_huge_pages++;
|
|
|
|
free_huge_pages_node[nid]++;
|
|
|
|
}
|
|
|
|
|
2006-01-06 11:10:46 +03:00
|
|
|
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
|
|
|
|
unsigned long address)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
int nid = numa_node_id();
|
|
|
|
struct page *page = NULL;
|
2006-01-06 11:10:46 +03:00
|
|
|
struct zonelist *zonelist = huge_zonelist(vma, address);
|
2006-01-06 11:10:45 +03:00
|
|
|
struct zone **z;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2006-01-06 11:10:45 +03:00
|
|
|
for (z = zonelist->zones; *z; z++) {
|
2006-09-26 10:31:55 +04:00
|
|
|
nid = zone_to_nid(*z);
|
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to
explicitly choose between the two variants:
cpuset_zone_allowed_hardwall()
cpuset_zone_allowed_softwall()
Until now, whether or not you got the hardwall flavor depended solely on
whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask
argument.
If you didn't specify __GFP_HARDWALL, you implicitly got the softwall
version.
Unfortunately, this meant that users would end up with the softwall version
without thinking about it. Since only the softwall version might sleep,
this led to bugs with possible sleeping in interrupt context on more than
one occassion.
The hardwall version requires that the current tasks mems_allowed allows
the node of the specified zone (or that you're in interrupt or that
__GFP_THISNODE is set or that you're on a one cpuset system.)
The softwall version, depending on the gfp_mask, might allow a node if it
was allowed in the nearest enclusing cpuset marked mem_exclusive (which
requires taking the cpuset lock 'callback_mutex' to evaluate.)
This patch removes the cpuset_zone_allowed() call, and forces the caller to
explicitly choose between the hardwall and the softwall case.
If the caller wants the gfp_mask to determine this choice, they should (1)
be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the
cpuset_zone_allowed_softwall() routine.
This adds another 100 or 200 bytes to the kernel text space, due to the few
lines of nearly duplicate code at the top of both cpuset_zone_allowed_*
routines. It should save a few instructions executed for the calls that
turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to
set (before the call) then check (within the call) the __GFP_HARDWALL flag.
For the most critical call, from get_page_from_freelist(), the same
instructions are executed as before -- the old cpuset_zone_allowed()
routine it used to call is the same code as the
cpuset_zone_allowed_softwall() routine that it calls now.
Not a perfect win, but seems worth it, to reduce this chance of hitting a
sleeping with irq off complaint again.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-13 11:34:25 +03:00
|
|
|
if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
|
2006-01-08 12:00:57 +03:00
|
|
|
!list_empty(&hugepage_freelists[nid]))
|
2006-01-06 11:10:45 +03:00
|
|
|
break;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2006-01-06 11:10:45 +03:00
|
|
|
|
|
|
|
if (*z) {
|
2005-04-17 02:20:36 +04:00
|
|
|
page = list_entry(hugepage_freelists[nid].next,
|
|
|
|
struct page, lru);
|
|
|
|
list_del(&page->lru);
|
|
|
|
free_huge_pages--;
|
|
|
|
free_huge_pages_node[nid]--;
|
|
|
|
}
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2006-03-22 11:08:56 +03:00
|
|
|
static void free_huge_page(struct page *page)
|
|
|
|
{
|
|
|
|
BUG_ON(page_count(page));
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&page->lru);
|
|
|
|
|
|
|
|
spin_lock(&hugetlb_lock);
|
|
|
|
enqueue_huge_page(page);
|
|
|
|
spin_unlock(&hugetlb_lock);
|
|
|
|
}
|
|
|
|
|
2006-03-22 11:08:08 +03:00
|
|
|
static int alloc_fresh_huge_page(void)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
static int nid = 0;
|
|
|
|
struct page *page;
|
|
|
|
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
|
|
|
|
HUGETLB_PAGE_ORDER);
|
2006-03-22 11:09:10 +03:00
|
|
|
nid = next_node(nid, node_online_map);
|
|
|
|
if (nid == MAX_NUMNODES)
|
|
|
|
nid = first_node(node_online_map);
|
2005-04-17 02:20:36 +04:00
|
|
|
if (page) {
|
2006-12-07 07:33:32 +03:00
|
|
|
set_compound_page_dtor(page, free_huge_page);
|
2005-11-22 08:32:28 +03:00
|
|
|
spin_lock(&hugetlb_lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
nr_huge_pages++;
|
|
|
|
nr_huge_pages_node[page_to_nid(page)]++;
|
2005-11-22 08:32:28 +03:00
|
|
|
spin_unlock(&hugetlb_lock);
|
2006-03-22 11:08:08 +03:00
|
|
|
put_page(page); /* free it into the hugepage allocator */
|
|
|
|
return 1;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2006-03-22 11:08:08 +03:00
|
|
|
return 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2006-03-22 11:08:56 +03:00
|
|
|
static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
spin_lock(&hugetlb_lock);
|
2006-06-23 13:03:15 +04:00
|
|
|
if (vma->vm_flags & VM_MAYSHARE)
|
|
|
|
resv_huge_pages--;
|
|
|
|
else if (free_huge_pages <= resv_huge_pages)
|
|
|
|
goto fail;
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:55 +03:00
|
|
|
|
|
|
|
page = dequeue_huge_page(vma, addr);
|
|
|
|
if (!page)
|
|
|
|
goto fail;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_unlock(&hugetlb_lock);
|
2006-03-22 11:08:40 +03:00
|
|
|
set_page_refcounted(page);
|
2005-04-17 02:20:36 +04:00
|
|
|
return page;
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:55 +03:00
|
|
|
|
2006-06-23 13:03:15 +04:00
|
|
|
fail:
|
2007-05-09 13:33:09 +04:00
|
|
|
if (vma->vm_flags & VM_MAYSHARE)
|
|
|
|
resv_huge_pages++;
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:55 +03:00
|
|
|
spin_unlock(&hugetlb_lock);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
 * Boot-time initialization of the hugepage pool.
 *
 * Does nothing when HPAGE_SHIFT is 0. Otherwise it initializes every
 * per-node free list and tries to allocate max_huge_pages pages, stopping
 * at the first allocation failure. The pool counters and max_huge_pages
 * are then set to the number of pages actually obtained.
 *
 * Always returns 0.
 */
static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	/* free_huge_pages is unsigned long, so print it with %lu. */
	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);
|
|
|
|
|
|
|
|
static int __init hugetlb_setup(char *s)
|
|
|
|
{
|
|
|
|
if (sscanf(s, "%lu", &max_huge_pages) <= 0)
|
|
|
|
max_huge_pages = 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("hugepages=", hugetlb_setup);
|
|
|
|
|
2007-05-09 13:33:34 +04:00
|
|
|
static unsigned int cpuset_mems_nr(unsigned int *array)
|
|
|
|
{
|
|
|
|
int node;
|
|
|
|
unsigned int nr = 0;
|
|
|
|
|
|
|
|
for_each_node_mask(node, cpuset_current_mems_allowed)
|
|
|
|
nr += array[node];
|
|
|
|
|
|
|
|
return nr;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#ifdef CONFIG_SYSCTL
|
|
|
|
static void update_and_free_page(struct page *page)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
nr_huge_pages--;
|
2006-09-26 10:31:55 +04:00
|
|
|
nr_huge_pages_node[page_to_nid(page)]--;
|
2005-04-17 02:20:36 +04:00
|
|
|
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
|
|
|
|
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
|
|
|
|
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
|
|
|
|
1 << PG_private | 1<< PG_writeback);
|
|
|
|
}
|
2006-03-22 11:08:08 +03:00
|
|
|
page[1].lru.next = NULL;
|
2006-03-22 11:08:40 +03:00
|
|
|
set_page_refcounted(page);
|
2005-04-17 02:20:36 +04:00
|
|
|
__free_pages(page, HUGETLB_PAGE_ORDER);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_HIGHMEM
/*
 * Shrink the pool towards @count huge pages by releasing free huge pages
 * that are not in highmem, leaving highmem pages on the free lists.
 *
 * @count: target pool size
 *
 * The caller, set_max_huge_pages(), holds hugetlb_lock.
 */
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;

		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			/*
			 * Test the target before releasing anything:
			 * set_max_huge_pages() raises count to
			 * resv_huge_pages after its own bounds check, so
			 * count may already be >= nr_huge_pages on entry.
			 */
			if (count >= nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			/*
			 * Do the free-list accounting while we still own
			 * the page; update_and_free_page() hands it back
			 * to the page allocator.
			 */
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			update_and_free_page(page);
		}
	}
}
#else
/* Without CONFIG_HIGHMEM there is nothing to do. */
static inline void try_to_free_low(unsigned long count)
{
}
#endif
|
|
|
|
|
|
|
|
static unsigned long set_max_huge_pages(unsigned long count)
|
|
|
|
{
|
|
|
|
while (count > nr_huge_pages) {
|
2006-03-22 11:08:08 +03:00
|
|
|
if (!alloc_fresh_huge_page())
|
2005-04-17 02:20:36 +04:00
|
|
|
return nr_huge_pages;
|
|
|
|
}
|
|
|
|
if (count >= nr_huge_pages)
|
|
|
|
return nr_huge_pages;
|
|
|
|
|
|
|
|
spin_lock(&hugetlb_lock);
|
2006-06-23 13:03:15 +04:00
|
|
|
count = max(count, resv_huge_pages);
|
2005-04-17 02:20:36 +04:00
|
|
|
try_to_free_low(count);
|
|
|
|
while (count < nr_huge_pages) {
|
2006-01-06 11:10:46 +03:00
|
|
|
struct page *page = dequeue_huge_page(NULL, 0);
|
2005-04-17 02:20:36 +04:00
|
|
|
if (!page)
|
|
|
|
break;
|
|
|
|
update_and_free_page(page);
|
|
|
|
}
|
|
|
|
spin_unlock(&hugetlb_lock);
|
|
|
|
return nr_huge_pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * sysctl handler for the hugepage pool size.
 *
 * Lets proc_doulongvec_minmax() process the request on @table, then
 * resizes the pool to max_huge_pages and stores the size actually
 * reached back into max_huge_pages.
 *
 * The result of proc_doulongvec_minmax() is not checked; this always
 * returns 0.
 */
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	unsigned long achieved;

	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);

	achieved = set_max_huge_pages(max_huge_pages);
	max_huge_pages = achieved;

	return 0;
}
|
|
|
|
#endif /* CONFIG_SYSCTL */
|
|
|
|
|
|
|
|
int hugetlb_report_meminfo(char *buf)
|
|
|
|
{
|
|
|
|
return sprintf(buf,
|
|
|
|
"HugePages_Total: %5lu\n"
|
|
|
|
"HugePages_Free: %5lu\n"
|
2006-06-23 13:03:15 +04:00
|
|
|
"HugePages_Rsvd: %5lu\n"
|
2005-04-17 02:20:36 +04:00
|
|
|
"Hugepagesize: %5lu kB\n",
|
|
|
|
nr_huge_pages,
|
|
|
|
free_huge_pages,
|
2006-06-23 13:03:15 +04:00
|
|
|
resv_huge_pages,
|
2005-04-17 02:20:36 +04:00
|
|
|
HPAGE_SIZE/1024);
|
|
|
|
}
|
|
|
|
|
|
|
|
int hugetlb_report_node_meminfo(int nid, char *buf)
|
|
|
|
{
|
|
|
|
return sprintf(buf,
|
|
|
|
"Node %d HugePages_Total: %5u\n"
|
|
|
|
"Node %d HugePages_Free: %5u\n",
|
|
|
|
nid, nr_huge_pages_node[nid],
|
|
|
|
nid, free_huge_pages_node[nid]);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
|
|
|
|
unsigned long hugetlb_total_pages(void)
|
|
|
|
{
|
|
|
|
return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We cannot handle pagefaults against hugetlb pages at all. They cause
|
|
|
|
* handle_mm_fault() to try to instantiate regular-sized pages in the
|
|
|
|
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
|
|
|
|
* this far.
|
|
|
|
*/
|
|
|
|
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, int *unused)
|
|
|
|
{
|
|
|
|
BUG();
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct vm_operations_struct hugetlb_vm_ops = {
|
|
|
|
.nopage = hugetlb_nopage,
|
|
|
|
};
|
|
|
|
|
2006-01-06 11:10:44 +03:00
|
|
|
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
|
|
|
|
int writable)
|
2005-06-22 04:14:44 +04:00
|
|
|
{
|
|
|
|
pte_t entry;
|
|
|
|
|
2006-01-06 11:10:44 +03:00
|
|
|
if (writable) {
|
2005-06-22 04:14:44 +04:00
|
|
|
entry =
|
|
|
|
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
|
|
|
|
} else {
|
|
|
|
entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
|
|
|
|
}
|
|
|
|
entry = pte_mkyoung(entry);
|
|
|
|
entry = pte_mkhuge(entry);
|
|
|
|
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
2006-01-06 11:10:44 +03:00
|
|
|
/*
 * Upgrade the huge pte at @ptep to a writable, dirty entry.
 *
 * The MMU cache and lazy protection state are refreshed only when
 * ptep_set_access_flags() returns non-zero.
 */
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry = pte_mkwrite(pte_mkdirty(*ptep));

	if (!ptep_set_access_flags(vma, address, ptep, entry, 1))
		return;

	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}
|
|
|
|
|
|
|
|
|
2005-06-22 04:14:44 +04:00
|
|
|
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
|
|
|
|
struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
pte_t *src_pte, *dst_pte, entry;
|
|
|
|
struct page *ptepage;
|
2005-10-20 08:23:43 +04:00
|
|
|
unsigned long addr;
|
2006-01-06 11:10:44 +03:00
|
|
|
int cow;
|
|
|
|
|
|
|
|
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
|
2005-06-22 04:14:44 +04:00
|
|
|
|
2005-10-20 08:23:43 +04:00
|
|
|
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
|
2005-10-30 04:16:23 +03:00
|
|
|
src_pte = huge_pte_offset(src, addr);
|
|
|
|
if (!src_pte)
|
|
|
|
continue;
|
2005-06-22 04:14:44 +04:00
|
|
|
dst_pte = huge_pte_alloc(dst, addr);
|
|
|
|
if (!dst_pte)
|
|
|
|
goto nomem;
|
2005-10-30 04:16:23 +03:00
|
|
|
spin_lock(&dst->page_table_lock);
|
2005-10-20 08:23:43 +04:00
|
|
|
spin_lock(&src->page_table_lock);
|
2005-10-30 04:16:23 +03:00
|
|
|
if (!pte_none(*src_pte)) {
|
2006-01-06 11:10:44 +03:00
|
|
|
if (cow)
|
|
|
|
ptep_set_wrprotect(src, addr, src_pte);
|
2005-10-20 08:23:43 +04:00
|
|
|
entry = *src_pte;
|
|
|
|
ptepage = pte_page(entry);
|
|
|
|
get_page(ptepage);
|
|
|
|
set_huge_pte_at(dst, addr, dst_pte, entry);
|
|
|
|
}
|
|
|
|
spin_unlock(&src->page_table_lock);
|
2005-10-30 04:16:23 +03:00
|
|
|
spin_unlock(&dst->page_table_lock);
|
2005-06-22 04:14:44 +04:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
nomem:
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2006-10-11 12:20:46 +04:00
|
|
|
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end)
|
2005-06-22 04:14:44 +04:00
|
|
|
{
|
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
|
|
unsigned long address;
|
2005-08-05 22:59:35 +04:00
|
|
|
pte_t *ptep;
|
2005-06-22 04:14:44 +04:00
|
|
|
pte_t pte;
|
|
|
|
struct page *page;
|
2006-10-04 13:15:24 +04:00
|
|
|
struct page *tmp;
|
2006-12-07 07:31:39 +03:00
|
|
|
/*
|
|
|
|
* A page gathering list, protected by per file i_mmap_lock. The
|
|
|
|
* lock is used to avoid list corruption from multiple unmapping
|
|
|
|
* of the same page since we are using page->lru.
|
|
|
|
*/
|
2006-10-04 13:15:24 +04:00
|
|
|
LIST_HEAD(page_list);
|
2005-06-22 04:14:44 +04:00
|
|
|
|
|
|
|
WARN_ON(!is_vm_hugetlb_page(vma));
|
|
|
|
BUG_ON(start & ~HPAGE_MASK);
|
|
|
|
BUG_ON(end & ~HPAGE_MASK);
|
|
|
|
|
2005-10-30 04:16:30 +03:00
|
|
|
spin_lock(&mm->page_table_lock);
|
2005-06-22 04:14:44 +04:00
|
|
|
for (address = start; address < end; address += HPAGE_SIZE) {
|
2005-08-05 22:59:35 +04:00
|
|
|
ptep = huge_pte_offset(mm, address);
|
2005-10-30 04:16:46 +03:00
|
|
|
if (!ptep)
|
2005-08-05 22:59:35 +04:00
|
|
|
continue;
|
|
|
|
|
2006-12-07 07:32:03 +03:00
|
|
|
if (huge_pmd_unshare(mm, &address, ptep))
|
|
|
|
continue;
|
|
|
|
|
2005-08-05 22:59:35 +04:00
|
|
|
pte = huge_ptep_get_and_clear(mm, address, ptep);
|
2005-06-22 04:14:44 +04:00
|
|
|
if (pte_none(pte))
|
|
|
|
continue;
|
2005-08-05 22:59:35 +04:00
|
|
|
|
2005-06-22 04:14:44 +04:00
|
|
|
page = pte_page(pte);
|
2007-02-09 01:20:27 +03:00
|
|
|
if (pte_dirty(pte))
|
|
|
|
set_page_dirty(page);
|
2006-10-04 13:15:24 +04:00
|
|
|
list_add(&page->lru, &page_list);
|
2005-06-22 04:14:44 +04:00
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
spin_unlock(&mm->page_table_lock);
|
2005-10-30 04:16:30 +03:00
|
|
|
flush_tlb_range(vma, start, end);
|
2006-10-04 13:15:24 +04:00
|
|
|
list_for_each_entry_safe(page, tmp, &page_list, lru) {
|
|
|
|
list_del(&page->lru);
|
|
|
|
put_page(page);
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2005-06-22 04:14:44 +04:00
|
|
|
|
2006-10-11 12:20:46 +04:00
|
|
|
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* It is undesirable to test vma->vm_file as it should be non-null
|
|
|
|
* for valid hugetlb area. However, vm_file will be NULL in the error
|
|
|
|
* cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
|
|
|
|
* do_mmap_pgoff() nullifies vma->vm_file before calling this function
|
|
|
|
* to clean up. Since no pte has actually been setup, it is safe to
|
|
|
|
* do nothing in this case.
|
|
|
|
*/
|
|
|
|
if (vma->vm_file) {
|
|
|
|
spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
|
|
|
|
__unmap_hugepage_range(vma, start, end);
|
|
|
|
spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-01-06 11:10:44 +03:00
|
|
|
/*
 * Handle a write to a write-protected huge page (copy-on-write).
 *
 * @ptep: page table entry for the faulting address
 * @pte:  value of that entry as seen by the caller
 *
 * If the page has a single reference, the entry is simply made writable.
 * Otherwise a new huge page is allocated and filled with
 * mm->page_table_lock dropped; the entry is switched to the copy only if
 * it still equals @pte once the lock has been retaken.
 *
 * Called and returns with mm->page_table_lock held. Returns
 * VM_FAULT_MINOR, or VM_FAULT_OOM if no huge page could be allocated.
 */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
		       unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page = pte_page(pte);
	struct page *new_page;
	struct page *to_drop;

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	if (page_count(old_page) == 1) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);
	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	/* The lock was dropped: look the entry up again and recheck it. */
	to_drop = new_page;
	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* The copy is now mapped; release the old page instead. */
		to_drop = old_page;
	}
	page_cache_release(to_drop);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}
|
|
|
|
|
2006-01-06 11:10:43 +03:00
|
|
|
/*
 * Instantiate the huge page for a fault on an empty huge pte.
 *
 * Looks the page up in the file's page cache. If it is absent, a new
 * huge page is allocated against the mapping's quota and zeroed; for
 * VM_SHARED mappings it is also inserted into the page cache. The pte is
 * then installed under mm->page_table_lock after rechecking the file
 * size and that the pte is still empty. A write fault on a private
 * mapping goes straight on to hugetlb_cow().
 *
 * Returns VM_FAULT_MINOR on success or if the pte was populated
 * meanwhile, VM_FAULT_OOM if no huge page could be allocated, and
 * VM_FAULT_SIGBUS otherwise (beyond end of file, quota refused, page
 * cache insertion failed).
 */
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
		    unsigned long address, pte_t *ptep, int write_access)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
	int ret = VM_FAULT_SIGBUS;
	unsigned long size;
	struct page *page;
	pte_t new_pte;
	int writable;

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size || hugetlb_get_quota(mapping))
			goto out;

		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (!(vma->vm_flags & VM_SHARED)) {
			lock_page(page);
		} else {
			int err = add_to_page_cache(page, mapping, idx,
						    GFP_KERNEL);

			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				/* Someone else inserted the page first. */
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		}
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	/* Only shared writable mappings get a writable pte right away. */
	writable = (vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED);
	new_pte = make_huge_pte(vma, page, writable);
	set_huge_pte_at(mm, address, ptep, new_pte);

	/* Optimization, do the COW without a second fault */
	if (write_access && !(vma->vm_flags & VM_SHARED))
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}
|
|
|
|
|
2006-01-06 11:10:43 +03:00
|
|
|
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
unsigned long address, int write_access)
|
|
|
|
{
|
|
|
|
pte_t *ptep;
|
|
|
|
pte_t entry;
|
2006-01-06 11:10:44 +03:00
|
|
|
int ret;
|
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache. When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage. However, since the number of hugepages in the system
is strictly limited, and it's usualy to want to use all of them, this can
still lead to spurious allocation failures.
For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage. One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.
The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache. It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us. Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.
This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.
Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather processes getting OOM killed by the VM. The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there. Another patch to replace those tests with
something saner for this reason as well as others coming...
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:53 +03:00
|
|
|
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
|
2006-01-06 11:10:43 +03:00
|
|
|
|
|
|
|
ptep = huge_pte_alloc(mm, address);
|
|
|
|
if (!ptep)
|
|
|
|
return VM_FAULT_OOM;
|
|
|
|
|
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache. When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage. However, since the number of hugepages in the system
is strictly limited, and it's usualy to want to use all of them, this can
still lead to spurious allocation failures.
For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage. One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.
The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache. It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us. Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.
This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.
Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather processes getting OOM killed by the VM. The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there. Another patch to replace those tests with
something saner for this reason as well as others coming...
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:53 +03:00
|
|
|
/*
|
|
|
|
* Serialize hugepage allocation and instantiation, so that we don't
|
|
|
|
* get spurious allocation failures if two CPUs race to instantiate
|
|
|
|
* the same page in the page cache.
|
|
|
|
*/
|
|
|
|
mutex_lock(&hugetlb_instantiation_mutex);
|
2006-01-06 11:10:43 +03:00
|
|
|
entry = *ptep;
|
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache. When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage. However, since the number of hugepages in the system
is strictly limited, and it's usualy to want to use all of them, this can
still lead to spurious allocation failures.
For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage. One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.
The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache. It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us. Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.
This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.
Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather processes getting OOM killed by the VM. The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there. Another patch to replace those tests with
something saner for this reason as well as others coming...
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:53 +03:00
|
|
|
if (pte_none(entry)) {
|
|
|
|
ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
|
|
|
|
mutex_unlock(&hugetlb_instantiation_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
2006-01-06 11:10:43 +03:00
|
|
|
|
2006-01-06 11:10:44 +03:00
|
|
|
ret = VM_FAULT_MINOR;
|
|
|
|
|
|
|
|
spin_lock(&mm->page_table_lock);
|
|
|
|
/* Check for a racing update before calling hugetlb_cow */
|
|
|
|
if (likely(pte_same(entry, *ptep)))
|
|
|
|
if (write_access && !pte_write(entry))
|
|
|
|
ret = hugetlb_cow(mm, vma, address, ptep, entry);
|
|
|
|
spin_unlock(&mm->page_table_lock);
|
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache. When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage. However, since the number of hugepages in the system
is strictly limited, and it's usualy to want to use all of them, this can
still lead to spurious allocation failures.
For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage. One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.
The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache. It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us. Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.
This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.
Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather processes getting OOM killed by the VM. The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there. Another patch to replace those tests with
something saner for this reason as well as others coming...
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 11:08:53 +03:00
|
|
|
mutex_unlock(&hugetlb_instantiation_mutex);
|
2006-01-06 11:10:44 +03:00
|
|
|
|
|
|
|
return ret;
|
2006-01-06 11:10:43 +03:00
|
|
|
}
|
|
|
|
|
2005-06-22 04:14:44 +04:00
|
|
|
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
|
|
struct page **pages, struct vm_area_struct **vmas,
|
|
|
|
unsigned long *position, int *length, int i)
|
|
|
|
{
|
2006-03-22 11:09:03 +03:00
|
|
|
unsigned long pfn_offset;
|
|
|
|
unsigned long vaddr = *position;
|
2005-06-22 04:14:44 +04:00
|
|
|
int remainder = *length;
|
|
|
|
|
2005-10-20 08:23:43 +04:00
|
|
|
spin_lock(&mm->page_table_lock);
|
2005-06-22 04:14:44 +04:00
|
|
|
while (vaddr < vma->vm_end && remainder) {
|
2005-10-30 04:16:46 +03:00
|
|
|
pte_t *pte;
|
|
|
|
struct page *page;
|
2005-06-22 04:14:44 +04:00
|
|
|
|
2005-10-30 04:16:46 +03:00
|
|
|
/*
|
|
|
|
* Some archs (sparc64, sh*) have multiple pte_ts to
|
|
|
|
* each hugepage. We have to make * sure we get the
|
|
|
|
* first, for the page indexing below to work.
|
|
|
|
*/
|
|
|
|
pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
|
2005-06-22 04:14:44 +04:00
|
|
|
|
2005-10-30 04:16:46 +03:00
|
|
|
if (!pte || pte_none(*pte)) {
|
|
|
|
int ret;
|
2005-06-22 04:14:44 +04:00
|
|
|
|
2005-10-30 04:16:46 +03:00
|
|
|
spin_unlock(&mm->page_table_lock);
|
|
|
|
ret = hugetlb_fault(mm, vma, vaddr, 0);
|
|
|
|
spin_lock(&mm->page_table_lock);
|
|
|
|
if (ret == VM_FAULT_MINOR)
|
|
|
|
continue;
|
2005-06-22 04:14:44 +04:00
|
|
|
|
2005-10-30 04:16:46 +03:00
|
|
|
remainder = 0;
|
|
|
|
if (!i)
|
|
|
|
i = -EFAULT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2006-03-22 11:09:03 +03:00
|
|
|
pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
|
|
|
|
page = pte_page(*pte);
|
|
|
|
same_page:
|
2006-03-31 14:29:57 +04:00
|
|
|
if (pages) {
|
|
|
|
get_page(page);
|
2006-03-22 11:09:03 +03:00
|
|
|
pages[i] = page + pfn_offset;
|
2006-03-31 14:29:57 +04:00
|
|
|
}
|
2005-06-22 04:14:44 +04:00
|
|
|
|
|
|
|
if (vmas)
|
|
|
|
vmas[i] = vma;
|
|
|
|
|
|
|
|
vaddr += PAGE_SIZE;
|
2006-03-22 11:09:03 +03:00
|
|
|
++pfn_offset;
|
2005-06-22 04:14:44 +04:00
|
|
|
--remainder;
|
|
|
|
++i;
|
2006-03-22 11:09:03 +03:00
|
|
|
if (vaddr < vma->vm_end && remainder &&
|
|
|
|
pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
|
|
|
|
/*
|
|
|
|
* We use pfn_offset to avoid touching the pageframes
|
|
|
|
* of this compound page.
|
|
|
|
*/
|
|
|
|
goto same_page;
|
|
|
|
}
|
2005-06-22 04:14:44 +04:00
|
|
|
}
|
2005-10-20 08:23:43 +04:00
|
|
|
spin_unlock(&mm->page_table_lock);
|
2005-06-22 04:14:44 +04:00
|
|
|
*length = remainder;
|
|
|
|
*position = vaddr;
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|
2006-03-22 11:08:50 +03:00
|
|
|
|
|
|
|
/*
 * hugetlb_change_protection - apply new protection bits to a hugetlb range.
 * @vma:     hugetlb vma being changed
 * @address: start of the range
 * @end:     end of the range (exclusive); must be greater than @address
 * @newprot: protection to apply to every populated huge pte in the range
 *
 * Flushes the cache for the range, rewrites each populated huge pte with
 * @newprot while holding the mapping's i_mmap_lock and mm->page_table_lock
 * (taken in that order), and finally flushes the TLB for the whole range.
 */
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	/* "address" is advanced by the loop; keep the start for the flush. */
	unsigned long start = address;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(mm, address);
		pte_t pte;

		/*
		 * Nothing to do if there is no pte slot, or if
		 * huge_pmd_unshare() reports non-zero.  The address is
		 * passed by pointer, so the helper may adjust it.
		 */
		if (!ptep || huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (pte_none(*ptep))
			continue;

		/* Clear the old entry, then install it with the new bits. */
		pte = huge_ptep_get_and_clear(mm, address, ptep);
		pte = pte_mkhuge(pte_modify(pte, newprot));
		set_huge_pte_at(mm, address, ptep, pte);
		lazy_mmu_prot_update(pte);
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}
|
|
|
|
|
2006-06-23 13:03:15 +04:00
|
|
|
struct file_region {
|
|
|
|
struct list_head link;
|
|
|
|
long from;
|
|
|
|
long to;
|
|
|
|
};
|
|
|
|
|
|
|
|
static long region_add(struct list_head *head, long f, long t)
|
|
|
|
{
|
|
|
|
struct file_region *rg, *nrg, *trg;
|
|
|
|
|
|
|
|
/* Locate the region we are either in or before. */
|
|
|
|
list_for_each_entry(rg, head, link)
|
|
|
|
if (f <= rg->to)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Round our left edge to the current segment if it encloses us. */
|
|
|
|
if (f > rg->from)
|
|
|
|
f = rg->from;
|
|
|
|
|
|
|
|
/* Check for and consume any regions we now overlap with. */
|
|
|
|
nrg = rg;
|
|
|
|
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
|
|
|
if (&rg->link == head)
|
|
|
|
break;
|
|
|
|
if (rg->from > t)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* If this area reaches higher then extend our area to
|
|
|
|
* include it completely. If this is not the first area
|
|
|
|
* which we intend to reuse, free it. */
|
|
|
|
if (rg->to > t)
|
|
|
|
t = rg->to;
|
|
|
|
if (rg != nrg) {
|
|
|
|
list_del(&rg->link);
|
|
|
|
kfree(rg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
nrg->from = f;
|
|
|
|
nrg->to = t;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static long region_chg(struct list_head *head, long f, long t)
|
|
|
|
{
|
|
|
|
struct file_region *rg, *nrg;
|
|
|
|
long chg = 0;
|
|
|
|
|
|
|
|
/* Locate the region we are before or in. */
|
|
|
|
list_for_each_entry(rg, head, link)
|
|
|
|
if (f <= rg->to)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* If we are below the current region then a new region is required.
|
|
|
|
* Subtle, allocate a new region at the position but make it zero
|
|
|
|
* size such that we can guarentee to record the reservation. */
|
|
|
|
if (&rg->link == head || t < rg->from) {
|
|
|
|
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
|
|
|
if (nrg == 0)
|
|
|
|
return -ENOMEM;
|
|
|
|
nrg->from = f;
|
|
|
|
nrg->to = f;
|
|
|
|
INIT_LIST_HEAD(&nrg->link);
|
|
|
|
list_add(&nrg->link, rg->link.prev);
|
|
|
|
|
|
|
|
return t - f;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Round our left edge to the current segment if it encloses us. */
|
|
|
|
if (f > rg->from)
|
|
|
|
f = rg->from;
|
|
|
|
chg = t - f;
|
|
|
|
|
|
|
|
/* Check for and consume any regions we now overlap with. */
|
|
|
|
list_for_each_entry(rg, rg->link.prev, link) {
|
|
|
|
if (&rg->link == head)
|
|
|
|
break;
|
|
|
|
if (rg->from > t)
|
|
|
|
return chg;
|
|
|
|
|
|
|
|
/* We overlap with this area, if it extends futher than
|
|
|
|
* us then we must extend ourselves. Account for its
|
|
|
|
* existing reservation. */
|
|
|
|
if (rg->to > t) {
|
|
|
|
chg += rg->to - t;
|
|
|
|
t = rg->to;
|
|
|
|
}
|
|
|
|
chg -= rg->to - rg->from;
|
|
|
|
}
|
|
|
|
return chg;
|
|
|
|
}
|
|
|
|
|
|
|
|
static long region_truncate(struct list_head *head, long end)
|
|
|
|
{
|
|
|
|
struct file_region *rg, *trg;
|
|
|
|
long chg = 0;
|
|
|
|
|
|
|
|
/* Locate the region we are either in or before. */
|
|
|
|
list_for_each_entry(rg, head, link)
|
|
|
|
if (end <= rg->to)
|
|
|
|
break;
|
|
|
|
if (&rg->link == head)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* If we are in the middle of a region then adjust it. */
|
|
|
|
if (end > rg->from) {
|
|
|
|
chg = rg->to - end;
|
|
|
|
rg->to = end;
|
|
|
|
rg = list_entry(rg->link.next, typeof(*rg), link);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Drop any remaining regions. */
|
|
|
|
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
|
|
|
if (&rg->link == head)
|
|
|
|
break;
|
|
|
|
chg += rg->to - rg->from;
|
|
|
|
list_del(&rg->link);
|
|
|
|
kfree(rg);
|
|
|
|
}
|
|
|
|
return chg;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int hugetlb_acct_memory(long delta)
|
|
|
|
{
|
|
|
|
int ret = -ENOMEM;
|
|
|
|
|
|
|
|
spin_lock(&hugetlb_lock);
|
|
|
|
if ((delta + resv_huge_pages) <= free_huge_pages) {
|
|
|
|
resv_huge_pages += delta;
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
spin_unlock(&hugetlb_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
|
|
|
|
{
|
|
|
|
long ret, chg;
|
|
|
|
|
|
|
|
chg = region_chg(&inode->i_mapping->private_list, from, to);
|
|
|
|
if (chg < 0)
|
|
|
|
return chg;
|
2007-05-09 13:33:34 +04:00
|
|
|
/*
|
|
|
|
* When cpuset is configured, it breaks the strict hugetlb page
|
|
|
|
* reservation as the accounting is done on a global variable. Such
|
|
|
|
* reservation is completely rubbish in the presence of cpuset because
|
|
|
|
* the reservation is not checked against page availability for the
|
|
|
|
* current cpuset. Application can still potentially OOM'ed by kernel
|
|
|
|
* with lack of free htlb page in cpuset that the task is in.
|
|
|
|
* Attempt to enforce strict accounting with cpuset is almost
|
|
|
|
* impossible (or too ugly) because cpuset is too fluid that
|
|
|
|
* task or memory node can be dynamically moved between cpusets.
|
|
|
|
*
|
|
|
|
* The change of semantics for shared hugetlb mapping with cpuset is
|
|
|
|
* undesirable. However, in order to preserve some of the semantics,
|
|
|
|
* we fall back to check against current free page availability as
|
|
|
|
* a best attempt and hopefully to minimize the impact of changing
|
|
|
|
* semantics that cpuset has.
|
|
|
|
*/
|
|
|
|
if (chg > cpuset_mems_nr(free_huge_pages_node))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2006-06-23 13:03:15 +04:00
|
|
|
ret = hugetlb_acct_memory(chg);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
region_add(&inode->i_mapping->private_list, from, to);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
|
|
|
{
|
|
|
|
long chg = region_truncate(&inode->i_mapping->private_list, offset);
|
|
|
|
hugetlb_acct_memory(freed - chg);
|
|
|
|
}
|