hugetlb: Try to grow hugetlb pool for MAP_PRIVATE mappings

Because we overcommit hugepages for MAP_PRIVATE mappings, it is possible that the hugetlb pool will be exhausted or completely reserved when a hugepage is needed to satisfy a page fault. Before killing the process in this situation, try to allocate a hugepage directly from the buddy allocator. The explicitly configured pool size becomes a low watermark. When dynamically grown, the allocated huge pages are accounted as a surplus over the watermark. As huge pages are freed on a node, surplus pages are released to the buddy allocator so that the pool will shrink back to the watermark. Surplus accounting also allows for friendlier explicit pool resizing. When shrinking a pool that is fully in-use, increase the surplus so pages will be returned to the buddy allocator as soon as they are freed. When growing a pool that has a surplus, consume the surplus first and then allocate new pages. Signed-off-by: Adam Litke <agl@us.ibm.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Andy Whitcroft <apw@shadowen.org> Acked-by: Dave McCracken <dave.mccracken@oracle.com> Cc: William Irwin <bill.irwin@oracle.com> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: Ken Chen <kenchen@google.com> Cc: Badari Pulavarty <pbadari@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 01:26:18 -07:00 · 2007-10-16 01:26:18 -07:00 · 7893d1d505
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@ -23,10 +23,12 @@
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 /*
 * Global pool counters.  hugetlb_report_meminfo() prints them as
 * HugePages_Total, HugePages_Free and HugePages_Rsvd respectively.
 */
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 /*
 * Number of pages currently in surplus state (printed as HugePages_Surp).
 * Incremented when alloc_buddy_huge_page() grows the pool and decremented
 * when free_huge_page() releases a page on a node that has a surplus.
 */
 static unsigned long surplus_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 /*
 * Per-node counters, indexed by node id.  The nr_ and surplus_ arrays are
 * updated together with their global totals in alloc_buddy_huge_page().
 */
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
@ -109,15 +111,57 @@ static void update_and_free_page(struct page *page)
 /*
 * Release a huge page that no longer has any references (a non-zero
 * page_count() trips the BUG_ON).  If the page's node currently carries a
 * surplus, the page is passed to update_and_free_page() and the surplus
 * counters are decremented; otherwise it is handed to enqueue_huge_page().
 * All accounting is done under hugetlb_lock.
 */
 static void free_huge_page(struct page *page)
 {
-	BUG_ON(page_count(page));
+	int nid = page_to_nid(page);
 	BUG_ON(page_count(page));
 	INIT_LIST_HEAD(&page->lru);
 	spin_lock(&hugetlb_lock);
 	if (surplus_huge_pages_node[nid]) {
 		/* Node is above its watermark: give this page up and shrink the surplus */
 		update_and_free_page(page);
 		surplus_huge_pages--;
 		surplus_huge_pages_node[nid]--;
 	} else {
 		/* No surplus on this node: keep the page in the pool */
 		enqueue_huge_page(page);
 	}
 	spin_unlock(&hugetlb_lock);
 }
 /*
 * Move surplus_huge_pages one step in the direction given by @delta
 * (+1 or -1).  Nodes are walked round-robin, starting after the node
 * visited last by the previous call, so that the per-node surplus
 * counters stay balanced.
 * Returns 1 if a node's counter was adjusted, 0 if no node qualified.
 */
 static int adjust_pool_surplus(int delta)
 {
 	static int prev_nid;
 	int node = prev_nid;
 	int adjusted = 0;
 	VM_BUG_ON(delta != -1 && delta != 1);
 	do {
 		int shrink_blocked, grow_blocked;
 		node = next_node(node, node_online_map);
 		if (node == MAX_NUMNODES)
 			node = first_node(node_online_map);
 		/* Shrinking requires a surplus page on this node */
 		shrink_blocked = delta < 0 && !surplus_huge_pages_node[node];
 		/* Growing must not push surplus above the node's page total */
 		grow_blocked = delta > 0 && surplus_huge_pages_node[node] >=
 						nr_huge_pages_node[node];
 		if (!shrink_blocked && !grow_blocked) {
 			surplus_huge_pages += delta;
 			surplus_huge_pages_node[node] += delta;
 			adjusted = 1;
 			break;
 		}
 	} while (node != prev_nid);
 	/* Remember where we stopped so the next call resumes from here */
 	prev_nid = node;
 	return adjusted;
 }
 static int alloc_fresh_huge_page(void)
 {
 	static int prev_nid;
@ -150,10 +194,30 @@ static int alloc_fresh_huge_page(void)
 	return 0;
 }
 static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 						unsigned long address)
 {
 	struct page *page;
 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
 		surplus_huge_pages++;
 		surplus_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
 	}
 	return page;
 }
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr)
 {
-	struct page *page;
+	struct page *page = NULL;
 	spin_lock(&hugetlb_lock);
 	if (vma->vm_flags & VM_MAYSHARE)
@ -173,7 +237,16 @@ fail:
 	if (vma->vm_flags & VM_MAYSHARE)
 		resv_huge_pages++;
 	spin_unlock(&hugetlb_lock);
-	return NULL;
+
 	/*
 	 * Private mappings do not use reserved huge pages so the allocation
 	 * may have failed due to an undersized hugetlb pool.  Try to grab a
 	 * surplus huge page from the buddy allocator.
 	 */
 	if (!(vma->vm_flags & VM_MAYSHARE))
 		page = alloc_buddy_huge_page(vma, addr);
 	return page;
 }
 static int __init hugetlb_init(void)
@ -241,26 +314,62 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
 /*
 * Number of huge pages that are not in surplus state.  Expands to a read
 * of two global counters; set_max_huge_pages() evaluates it while holding
 * hugetlb_lock.
 */
 #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
 /*
 * Resize the persistent huge page pool toward @count pages.
 * Returns the value of persistent_huge_pages after the attempt, which can
 * differ from @count when a fresh page cannot be allocated or when pages
 * can only be moved into surplus state rather than released.
 */
 static unsigned long set_max_huge_pages(unsigned long count)
 {
-	while (count > nr_huge_pages) {
+	unsigned long min_count, ret;
 		if (!alloc_fresh_huge_page())
 			return nr_huge_pages;
 	}
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
 	/*
 	 * Increase the pool size
 	 * First take pages out of surplus state.  Then make up the
 	 * remaining difference by allocating fresh huge pages.
 	 */
 	spin_lock(&hugetlb_lock);
-	count = max(count, resv_huge_pages);
+	while (surplus_huge_pages && count > persistent_huge_pages) {
-	try_to_free_low(count);
+		if (!adjust_pool_surplus(-1))
-	while (count < nr_huge_pages) {
+			break;
 	}
 	while (count > persistent_huge_pages) {
 		/*
 		 * Shadows the function-level ret; only holds the status of
 		 * alloc_fresh_huge_page() for this iteration.
 		 */
 		int ret;
 		/*
 		 * If this allocation races such that we no longer need the
 		 * page, free_huge_page will handle it by freeing the page
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
 		ret = alloc_fresh_huge_page();
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
 	}
 	if (count >= persistent_huge_pages)
 		goto out;
 	/*
 	 * Decrease the pool size
 	 * First return free pages to the buddy allocator (being careful
 	 * to keep enough around to satisfy reservations).  Then place
 	 * pages into surplus state as needed so the pool will shrink
 	 * to the desired size as pages become free.
 	 */
 	min_count = max(count, resv_huge_pages);
 	try_to_free_low(min_count);
 	while (min_count < persistent_huge_pages) {
 		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
 	}
 	while (count < persistent_huge_pages) {
 		if (!adjust_pool_surplus(1))
 			break;
 	}
 out:
 	/* Sample the result before dropping the lock */
 	ret = persistent_huge_pages;
 	spin_unlock(&hugetlb_lock);
-	return nr_huge_pages;
+	return ret;
 }
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@ -292,10 +401,12 @@ int hugetlb_report_meminfo(char *buf)
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
 			"HugePages_Rsvd:  %5lu\n"
 			"HugePages_Surp:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
 			resv_huge_pages,
 			surplus_huge_pages,
 			HPAGE_SIZE/1024);
 }