percpu: drop pcpu_chunk->page[]

percpu core doesn't need to track all the allocated pages.  It needs to
know whether certain pages are populated and a way to reverse map
address to page when freeing.  This patch drops pcpu_chunk->page[] and
uses the populated bitmap and vmalloc_to_page() lookup instead.  Using
vmalloc_to_page() exclusively is also possible but complicates first
chunk handling, inflates cache footprint and prevents non-standard
memory allocation for percpu memory.

pcpu_chunk->page[] was used to track each page's allocation and
allowed asymmetric population which happens during the failure path;
however, with a single bitmap for all units, this is no longer possible.
Bite the bullet and rewrite (de)populate functions so that things are
done in clearly separated steps such that asymmetric population
doesn't happen.  This makes the (de)population process much more
modular and will also ease implementing non-standard memory usage in
the future (e.g. large pages).

This makes the @get_page_fn parameter of pcpu_setup_first_chunk()
unnecessary.  The parameter is dropped and all first chunk helpers are
updated accordingly.  Please note that despite the volume, most changes
to first chunk helpers are symbol renames for variables which don't
need to be referenced outside of the helper anymore.

This change reduces memory usage and cache footprint of pcpu_chunk.
Now only #unit_pages bits are necessary per chunk.

[ Impact: reduced memory usage and cache footprint for bookkeeping ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
This commit is contained in:
Tejun Heo 2009-07-04 08:11:00 +09:00
Родитель c8a51be4ca
Коммит ce3141a277
3 изменённых файлов: 407 добавлений и 256 удалений

Просмотреть файл

@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
#endif #endif
} }
static size_t pcpur_size __initdata;
static void **pcpur_ptrs __initdata;
static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpur_size)
return NULL;
return virt_to_page(pcpur_ptrs[cpu] + off);
}
#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL)
static void __init pcpu_map_range(unsigned long start, unsigned long end, static void __init pcpu_map_range(unsigned long start, unsigned long end,
@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void)
size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start;
static struct vm_struct vm; static struct vm_struct vm;
unsigned long delta, cpu; unsigned long delta, cpu;
size_t pcpu_unit_size; size_t size_sum, pcpu_unit_size;
size_t ptrs_size; size_t ptrs_size;
void **ptrs;
pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE); PERCPU_DYNAMIC_RESERVE);
dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE;
ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0]));
pcpur_ptrs = alloc_bootmem(ptrs_size); ptrs = alloc_bootmem(ptrs_size);
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE,
PCPU_CHUNK_SIZE); PCPU_CHUNK_SIZE);
free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), free_bootmem(__pa(ptrs[cpu] + size_sum),
PCPU_CHUNK_SIZE - pcpur_size); PCPU_CHUNK_SIZE - size_sum);
memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); memcpy(ptrs[cpu], __per_cpu_load, static_size);
} }
/* allocate address and map */ /* allocate address and map */
@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void)
start += cpu * PCPU_CHUNK_SIZE; start += cpu * PCPU_CHUNK_SIZE;
end = start + PCPU_CHUNK_SIZE; end = start + PCPU_CHUNK_SIZE;
pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu])); pcpu_map_range(start, end, virt_to_page(ptrs[cpu]));
} }
pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, pcpu_unit_size = pcpu_setup_first_chunk(static_size,
PERCPU_MODULE_RESERVE, dyn_size, PERCPU_MODULE_RESERVE, dyn_size,
PCPU_CHUNK_SIZE, vm.addr); PCPU_CHUNK_SIZE, vm.addr);
free_bootmem(__pa(pcpur_ptrs), ptrs_size); free_bootmem(__pa(ptrs), ptrs_size);
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {

Просмотреть файл

@ -58,13 +58,12 @@
extern void *pcpu_base_addr; extern void *pcpu_base_addr;
typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern size_t __init pcpu_setup_first_chunk(
size_t static_size, size_t reserved_size, size_t static_size, size_t reserved_size,
ssize_t dyn_size, size_t unit_size, ssize_t dyn_size, size_t unit_size,
void *base_addr); void *base_addr);

Просмотреть файл

@ -94,8 +94,7 @@ struct pcpu_chunk {
int map_alloc; /* # of map entries allocated */ int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */ int *map; /* allocation map */
bool immutable; /* no [de]population allowed */ bool immutable; /* no [de]population allowed */
struct page **page; /* points to page array */ unsigned long populated[]; /* populated bitmap */
struct page *page_ar[]; /* #cpus * UNIT_PAGES */
}; };
static int pcpu_unit_pages __read_mostly; static int pcpu_unit_pages __read_mostly;
@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit;
* Synchronization rules. * Synchronization rules.
* *
* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
* protects allocation/reclaim paths, chunks and chunk->page arrays. * protects allocation/reclaim paths, chunks, populated bitmap and
* The latter is a spinlock and protects the index data structures - * vmalloc mapping. The latter is a spinlock and protects the index
* chunk slots, chunks and area maps in chunks. * data structures - chunk slots, chunks and area maps in chunks.
* *
* During allocation, pcpu_alloc_mutex is kept locked all the time and * During allocation, pcpu_alloc_mutex is kept locked all the time and
* pcpu_lock is grabbed and released as necessary. All actual memory * pcpu_lock is grabbed and released as necessary. All actual memory
@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
} }
static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx) unsigned int cpu, int page_idx)
{ {
return &chunk->page[pcpu_page_idx(cpu, page_idx)]; /* must not be used on pre-mapped chunk */
} WARN_ON(chunk->immutable);
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
int page_idx)
{
return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
} }
/* set the pointer to a chunk in a page struct */ /* set the pointer to a chunk in a page struct */
@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
return (struct pcpu_chunk *)page->index; return (struct pcpu_chunk *)page->index;
} }
/*
* Advance to the next unpopulated page region of @chunk.
*
* On entry *@rs is the page index to start searching from.  On return
* *@rs holds the index of the first clear bit in @chunk->populated at or
* after that position and *@re holds the index of the next set bit
* following it, i.e. one past the last page of the region.  Both
* searches are bounded by @end (presumably both results are @end when
* no matching bit exists, as is usual for the find_next_*bit() helpers
* - confirm against their documentation).
*/
static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
*rs = find_next_zero_bit(chunk->populated, end, *rs);
*re = find_next_bit(chunk->populated, end, *rs + 1);
}
/*
* Advance to the next populated page region of @chunk.
*
* On entry *@rs is the page index to start searching from.  On return
* *@rs holds the index of the first set bit in @chunk->populated at or
* after that position and *@re holds the index of the next clear bit
* following it, i.e. one past the last page of the region.  Both
* searches are bounded by @end (presumably both results are @end when
* no matching bit exists, as is usual for the find_next_*bit() helpers
* - confirm against their documentation).
*/
static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
*rs = find_next_bit(chunk->populated, end, *rs);
*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
}
/*
* (Un)populated page region iterators.  Iterate over (un)populated
* page regions between @start and @end in @chunk.  @rs and @re should
* be integer variables and will be set to the start page index and the
* end page index (exclusive) of the current region on each iteration.
*
* The loop stops as soon as pcpu_next_[un]pop() yields an empty region
* (@rs >= @re).  After each iteration the search resumes at @re + 1,
* since the page at @re is already known not to belong to the kind of
* region being looked for.
*
* These are macros: @chunk, @rs, @re and @end are evaluated more than
* once, so pass plain variables rather than expressions with side
* effects.
*/
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
/** /**
* pcpu_mem_alloc - allocate memory * pcpu_mem_alloc - allocate memory
* @size: bytes to allocate * @size: bytes to allocate
@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
} }
/** /**
* pcpu_unmap - unmap pages out of a pcpu_chunk * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
* @chunk: chunk of interest * @chunk: chunk of interest
* @page_start: page index of the first page to unmap * @bitmapp: output parameter for bitmap
* @page_end: page index of the last page to unmap + 1 * @may_alloc: may allocate the array
* @flush_tlb: whether to flush tlb or not
* *
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * Returns pointer to array of pointers to struct page and bitmap,
* If @flush is true, vcache is flushed before unmapping and tlb * both of which can be indexed with pcpu_page_idx(). The returned
* after. * array is cleared to zero and *@bitmapp is copied from
* @chunk->populated. Note that there is only one array and bitmap
* and access exclusion is the caller's responsibility.
*
* CONTEXT:
* pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
* Otherwise, don't care.
*
* RETURNS:
* Pointer to temp pages array on success, NULL on failure.
*/ */
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
bool flush_tlb) unsigned long **bitmapp,
bool may_alloc)
{
static struct page **pages;
static unsigned long *bitmap;
size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
sizeof(pages[0]);
size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
sizeof(unsigned long);
if (!pages || !bitmap) {
if (may_alloc && !pages)
pages = pcpu_mem_alloc(pages_size);
if (may_alloc && !bitmap)
bitmap = pcpu_mem_alloc(bitmap_size);
if (!pages || !bitmap)
return NULL;
}
memset(pages, 0, pages_size);
bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
*bitmapp = bitmap;
return pages;
}
/**
* pcpu_free_pages - free pages which were allocated for @chunk
* @chunk: chunk pages were allocated for
* @pages: array of pages to be freed, indexed by pcpu_page_idx()
* @populated: populated bitmap
* @page_start: page index of the first page to be freed
* @page_end: page index of the last page to be freed + 1
*
* Free pages [@page_start,@page_end) in @pages for all units.
* The pages were allocated for @chunk.
*
* NULL entries in @pages are skipped, so the range may be only
* partially filled in.  Entries are not cleared after being freed.
* @chunk and @populated are part of the common signature of the page
* helpers; this implementation reads neither.
*/
static void pcpu_free_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
unsigned int cpu;
int i;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page *page = pages[pcpu_page_idx(cpu, i)];
/* slot may never have been filled, e.g. after a failed allocation */
if (page)
__free_page(page);
}
}
}
/**
* pcpu_alloc_pages - allocates pages for @chunk
* @chunk: target chunk
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
* @populated: populated bitmap
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
*
* Allocate pages [@page_start,@page_end) into @pages for all units.
* The allocation is for @chunk.  Percpu core doesn't care about the
* content of @pages and will pass it verbatim to pcpu_map_pages().
*
* Each page is a single (order 0) page requested from the node of the
* cpu it is for.  If any allocation fails, pcpu_free_pages() is called
* on the whole [@page_start,@page_end) range for all units, which
* frees every non-NULL entry.  The caller must therefore pass in an
* array whose entries for this range are NULL.  @chunk and @populated
* are only forwarded to pcpu_free_pages().
*
* RETURNS:
* 0 on success, -ENOMEM if a page could not be allocated.
*/
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
unsigned int cpu;
int i;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
if (!*pagep) {
/* undo everything allocated so far for this range */
pcpu_free_pages(chunk, pages, populated,
page_start, page_end);
return -ENOMEM;
}
}
}
return 0;
}
/**
* pcpu_pre_unmap_flush - flush cache prior to unmapping
* @chunk: chunk the regions to be flushed belongs to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages in [@page_start,@page_end) of @chunk are about to be
* unmapped. Flush cache. As each flushing trial can be very
* expensive, issue flush on the whole region at once rather than
* doing it for each cpu. This could be an overkill but is more
* scalable.
*/
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{ {
unsigned int last = num_possible_cpus() - 1; unsigned int last = num_possible_cpus() - 1;
unsigned int cpu;
/* unmap must not be done on immutable chunk */
WARN_ON(chunk->immutable);
/*
* Each flushing trial can be very expensive, issue flush on
* the whole region at once rather than doing it for each cpu.
* This could be an overkill but is more scalable.
*/
flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end)); pcpu_chunk_addr(chunk, last, page_end));
}
for_each_possible_cpu(cpu) static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
unmap_kernel_range_noflush( {
pcpu_chunk_addr(chunk, cpu, page_start), unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
(page_end - page_start) << PAGE_SHIFT); }
/* ditto as flush_cache_vunmap() */ /**
if (flush_tlb) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), * @chunk: chunk of interest
pcpu_chunk_addr(chunk, last, page_end)); * @pages: pages array which can be used to pass information to free
* @populated: populated bitmap
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* Corresponding elements in @pages were cleared by the caller and can
* be used to carry information to pcpu_free_pages() which will be
* called after all unmaps are finished. The caller should call
* proper pre/post flush functions.
*/
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{
unsigned int cpu;
int i;
for_each_possible_cpu(cpu) {
/*
* Look each page up through the chunk's address while it is
* still mapped and stash it in @pages, so that the pages can
* be found again after the mapping is gone.
*/
for (i = page_start; i < page_end; i++) {
struct page *page;
page = pcpu_chunk_page(chunk, cpu, i);
WARN_ON(!page);
pages[pcpu_page_idx(cpu, i)] = page;
}
/* drop this cpu's mapping of the whole range in one call */
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
page_end - page_start);
}
/* mark the range unpopulated in the bitmap passed in as @populated */
for (i = page_start; i < page_end; i++)
__clear_bit(i, populated);
}
/**
* pcpu_post_unmap_tlb_flush - flush TLB after unmapping
* @chunk: pcpu_chunk the regions to be flushed belong to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
* TLB for the regions. This can be skipped if the area is to be
* returned to vmalloc as vmalloc will handle TLB flushing lazily.
*
* As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
* for the whole region.
*/
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
unsigned int last = num_possible_cpus() - 1;
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
} }
static int __pcpu_map_pages(unsigned long addr, struct page **pages, static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
} }
/** /**
* pcpu_map - map pages into a pcpu_chunk * pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest * @chunk: chunk of interest
* @pages: pages array containing pages to be mapped
* @populated: populated bitmap
* @page_start: page index of the first page to map * @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1 * @page_end: page index of the last page to map + 1
* *
* For each cpu, map pages [@page_start,@page_end) into @chunk. * For each cpu, map pages [@page_start,@page_end) into @chunk. The
* vcache is flushed afterwards. * caller is responsible for calling pcpu_post_map_flush() after all
* mappings are complete.
*
* This function is responsible for setting corresponding bits in
* @chunk->populated bitmap and whatever is necessary for reverse
* lookup (addr -> chunk).
*/ */
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) static int pcpu_map_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
{ {
unsigned int last = num_possible_cpus() - 1; unsigned int cpu, tcpu;
unsigned int cpu; int i, err;
int err;
/* map must not be done on immutable chunk */
WARN_ON(chunk->immutable);
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
pcpu_chunk_pagep(chunk, cpu, page_start), &pages[pcpu_page_idx(cpu, page_start)],
page_end - page_start); page_end - page_start);
if (err < 0) if (err < 0)
return err; goto err;
} }
/* mapping successful, link chunk and mark populated */
for (i = page_start; i < page_end; i++) {
for_each_possible_cpu(cpu)
pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
chunk);
__set_bit(i, populated);
}
return 0;
err:
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
return err;
}
/**
* pcpu_post_map_flush - flush cache after mapping
* @chunk: pcpu_chunk the regions to be flushed belong to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages [@page_start,@page_end) of @chunk have been mapped. Flush
* cache.
*
* As with pcpu_pre_unmap_flush(), the cache flush is done at once
* for the whole region.
*/
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
unsigned int last = num_possible_cpus() - 1;
/* flush at once, please read comments in pcpu_unmap() */ /* flush at once, please read comments in pcpu_unmap() */
flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end)); pcpu_chunk_addr(chunk, last, page_end));
return 0;
} }
/** /**
@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
* CONTEXT: * CONTEXT:
* pcpu_alloc_mutex. * pcpu_alloc_mutex.
*/ */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
bool flush)
{ {
int page_start = PFN_DOWN(off); int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size); int page_end = PFN_UP(off + size);
int unmap_start = -1; struct page **pages;
int uninitialized_var(unmap_end); unsigned long *populated;
unsigned int cpu; int rs, re;
int i;
for (i = page_start; i < page_end; i++) { /* quick path, check whether it's empty already */
for_each_possible_cpu(cpu) { pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); if (rs == page_start && re == page_end)
return;
if (!*pagep) break;
continue;
__free_page(*pagep);
/*
* If it's partial depopulation, it might get
* populated or depopulated again. Mark the
* page gone.
*/
*pagep = NULL;
unmap_start = unmap_start < 0 ? i : unmap_start;
unmap_end = i + 1;
}
} }
if (unmap_start >= 0) /* immutable chunks can't be depopulated */
pcpu_unmap(chunk, unmap_start, unmap_end, flush); WARN_ON(chunk->immutable);
/*
* If control reaches here, there must have been at least one
* successful population attempt so the temp pages array must
* be available now.
*/
pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
BUG_ON(!pages);
/* unmap and free */
pcpu_pre_unmap_flush(chunk, page_start, page_end);
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
pcpu_unmap_pages(chunk, pages, populated, rs, re);
/* no need to flush tlb, vmalloc will handle it lazily */
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
pcpu_free_pages(chunk, pages, populated, rs, re);
/* commit new bitmap */
bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
} }
/** /**
@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
*/ */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{ {
const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off); int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size); int page_end = PFN_UP(off + size);
int map_start = -1; int free_end = page_start, unmap_end = page_start;
int uninitialized_var(map_end); struct page **pages;
unsigned long *populated;
unsigned int cpu; unsigned int cpu;
int i; int rs, re, rc;
for (i = page_start; i < page_end; i++) { /* quick path, check whether all pages are already there */
if (pcpu_chunk_page_occupied(chunk, i)) { pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
if (map_start >= 0) { if (rs == page_start && re == page_end)
if (pcpu_map(chunk, map_start, map_end)) goto clear;
goto err; break;
map_start = -1;
}
continue;
}
map_start = map_start < 0 ? i : map_start;
map_end = i + 1;
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
*pagep = alloc_pages_node(cpu_to_node(cpu),
alloc_mask, 0);
if (!*pagep)
goto err;
pcpu_set_page_chunk(*pagep, chunk);
}
} }
if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) /* need to allocate and map pages, this chunk can't be immutable */
goto err; WARN_ON(chunk->immutable);
pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
if (!pages)
return -ENOMEM;
/* alloc and map */
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
if (rc)
goto err_free;
free_end = re;
}
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
rc = pcpu_map_pages(chunk, pages, populated, rs, re);
if (rc)
goto err_unmap;
unmap_end = re;
}
pcpu_post_map_flush(chunk, page_start, page_end);
/* commit new bitmap */
bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
clear:
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
size); size);
return 0; return 0;
err:
/* likely under heavy memory pressure, give memory back */ err_unmap:
pcpu_depopulate_chunk(chunk, off, size, true); pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
return -ENOMEM; pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
pcpu_unmap_pages(chunk, pages, populated, rs, re);
pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
err_free:
pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
pcpu_free_pages(chunk, pages, populated, rs, re);
return rc;
} }
static void free_pcpu_chunk(struct pcpu_chunk *chunk) static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->page = chunk->page_ar;
chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
if (!chunk->vm) { if (!chunk->vm) {
@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work)
mutex_unlock(&pcpu_alloc_mutex); mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) { list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk); free_pcpu_chunk(chunk);
} }
} }
@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
/** /**
* pcpu_setup_first_chunk - initialize the first percpu chunk * pcpu_setup_first_chunk - initialize the first percpu chunk
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes * @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes, 0 for none * @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
* percpu area. This function is to be called from arch percpu area * percpu area. This function is to be called from arch percpu area
* setup path. * setup path.
* *
* @get_page_fn() should return pointer to percpu page given cpu
* number and page number. It should at least return enough pages to
* cover the static area. The returned pages for static area should
* have been initialized with valid data. It can also return pages
* after the static area. NULL return indicates end of pages for the
* cpu. Note that @get_page_fn() must return the same number of pages
* for all cpus.
*
* @reserved_size, if non-zero, specifies the amount of bytes to * @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves * reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved * the first chunk such that it's available only through reserved
@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
* The determined pcpu_unit_size which can be used to initialize * The determined pcpu_unit_size which can be used to initialize
* percpu access. * percpu access.
*/ */
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
size_t static_size, size_t reserved_size,
ssize_t dyn_size, size_t unit_size, ssize_t dyn_size, size_t unit_size,
void *base_addr) void *base_addr)
{ {
@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t size_sum = static_size + reserved_size + size_t size_sum = static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0); (dyn_size >= 0 ? dyn_size : 0);
struct pcpu_chunk *schunk, *dchunk = NULL; struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned int cpu; int i;
int i, nr_pages;
/* sanity checks */ /* sanity checks */
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
if (dyn_size < 0) if (dyn_size < 0)
dyn_size = pcpu_unit_size - static_size - reserved_size; dyn_size = pcpu_unit_size - static_size - reserved_size;
@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
schunk->vm = &first_vm; schunk->vm = &first_vm;
schunk->map = smap; schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap); schunk->map_alloc = ARRAY_SIZE(smap);
schunk->page = schunk->page_ar;
schunk->immutable = true; schunk->immutable = true;
bitmap_fill(schunk->populated, pcpu_unit_pages);
if (reserved_size) { if (reserved_size) {
schunk->free_size = reserved_size; schunk->free_size = reserved_size;
@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
/* init dynamic chunk if necessary */ /* init dynamic chunk if necessary */
if (dyn_size) { if (dyn_size) {
dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list); INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm; dchunk->vm = &first_vm;
dchunk->map = dmap; dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->map_alloc = ARRAY_SIZE(dmap);
dchunk->page = schunk->page_ar; /* share page map with schunk */
dchunk->immutable = true; dchunk->immutable = true;
bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size; dchunk->map[dchunk->map_used++] = dchunk->free_size;
} }
/* assign pages */
nr_pages = -1;
for_each_possible_cpu(cpu) {
for (i = 0; i < pcpu_unit_pages; i++) {
struct page *page = get_page_fn(cpu, i);
if (!page)
break;
*pcpu_chunk_pagep(schunk, cpu, i) = page;
}
BUG_ON(i < PFN_UP(static_size));
if (nr_pages < 0)
nr_pages = i;
else
BUG_ON(nr_pages != i);
}
/* link the first chunk in */ /* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk; pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1); pcpu_chunk_relocate(pcpu_first_chunk, -1);
@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
return size_sum; return size_sum;
} }
/*
* Embedding first chunk setup helper.
*/
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpue_size)
return NULL;
return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
}
/** /**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @static_size: the size of static percpu area in bytes * @static_size: the size of static percpu area in bytes
@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
ssize_t dyn_size) ssize_t dyn_size)
{ {
size_t chunk_size; size_t size_sum, unit_size, chunk_size;
void *base;
unsigned int cpu; unsigned int cpu;
/* determine parameters and allocate */ /* determine parameters and allocate */
pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
chunk_size = pcpue_unit_size * num_possible_cpus(); chunk_size = unit_size * num_possible_cpus();
pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS)); __pa(MAX_DMA_ADDRESS));
if (!pcpue_ptr) { if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for " pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size); "embedding\n", chunk_size);
return -ENOMEM; return -ENOMEM;
@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
/* return the leftover and copy */ /* return the leftover and copy */
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
void *ptr = pcpue_ptr + cpu * pcpue_unit_size; void *ptr = base + cpu * unit_size;
free_bootmem(__pa(ptr + pcpue_size), free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
pcpue_unit_size - pcpue_size);
memcpy(ptr, __per_cpu_load, static_size); memcpy(ptr, __per_cpu_load, static_size);
} }
/* we're ready, commit */ /* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); size_sum >> PAGE_SHIFT, base, static_size);
return pcpu_setup_first_chunk(pcpue_get_page, static_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
reserved_size, dyn_size, unit_size, base);
pcpue_unit_size, pcpue_ptr);
}
/*
* 4k page first chunk setup helper.
*/
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_unit_pages __initdata;
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
if (pageno < pcpu4k_unit_pages)
return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
return NULL;
} }
/** /**
@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
pcpu_fc_populate_pte_fn_t populate_pte_fn) pcpu_fc_populate_pte_fn_t populate_pte_fn)
{ {
static struct vm_struct vm; static struct vm_struct vm;
int unit_pages;
size_t pages_size; size_t pages_size;
struct page **pages;
unsigned int cpu; unsigned int cpu;
int i, j; int i, j;
ssize_t ret; ssize_t ret;
pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
PCPU_MIN_UNIT_SIZE)); PCPU_MIN_UNIT_SIZE));
/* unaligned allocations can't be freed, round up to page size */ /* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pcpu4k_pages[0])); sizeof(pages[0]));
pcpu4k_pages = alloc_bootmem(pages_size); pages = alloc_bootmem(pages_size);
/* allocate pages */ /* allocate pages */
j = 0; j = 0;
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
for (i = 0; i < pcpu4k_unit_pages; i++) { for (i = 0; i < unit_pages; i++) {
void *ptr; void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE); ptr = alloc_fn(cpu, PAGE_SIZE);
@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
"4k page for cpu%u\n", cpu); "4k page for cpu%u\n", cpu);
goto enomem; goto enomem;
} }
pcpu4k_pages[j++] = virt_to_page(ptr); pages[j++] = virt_to_page(ptr);
} }
/* allocate vm area, map the pages and copy static data */ /* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC; vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
vm_area_register_early(&vm, PAGE_SIZE); vm_area_register_early(&vm, PAGE_SIZE);
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
unsigned long unit_addr = (unsigned long)vm.addr + unsigned long unit_addr = (unsigned long)vm.addr +
(cpu * pcpu4k_unit_pages << PAGE_SHIFT); (cpu * unit_pages << PAGE_SHIFT);
for (i = 0; i < pcpu4k_unit_pages; i++) for (i = 0; i < unit_pages; i++)
populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */ /* pte already populated, the following shouldn't fail */
ret = __pcpu_map_pages(unit_addr, ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
&pcpu4k_pages[cpu * pcpu4k_unit_pages], unit_pages);
pcpu4k_unit_pages);
if (ret < 0) if (ret < 0)
panic("failed to map percpu area, err=%zd\n", ret); panic("failed to map percpu area, err=%zd\n", ret);
@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
/* we're ready, commit */ /* we're ready, commit */
pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
pcpu4k_unit_pages, static_size); unit_pages, static_size);
ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr);
pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
goto out_free_ar; goto out_free_ar;
enomem: enomem:
while (--j >= 0) while (--j >= 0)
free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); free_fn(page_address(pages[j]), PAGE_SIZE);
ret = -ENOMEM; ret = -ENOMEM;
out_free_ar: out_free_ar:
free_bootmem(__pa(pcpu4k_pages), pages_size); free_bootmem(__pa(pages), pages_size);
return ret; return ret;
} }
@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size;
static struct pcpul_ent *pcpul_map; static struct pcpul_ent *pcpul_map;
static struct vm_struct pcpul_vm; static struct vm_struct pcpul_vm;
static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpul_size)
return NULL;
return virt_to_page(pcpul_map[cpu].ptr + off);
}
/** /**
* pcpu_lpage_first_chunk - remap the first percpu chunk using large page * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
* @static_size: the size of static percpu area in bytes * @static_size: the size of static percpu area in bytes
@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
pr_info("PERCPU: Remapped at %p with large pages, static data " pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", pcpul_vm.addr, static_size); "%zu bytes\n", pcpul_vm.addr, static_size);
ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
reserved_size, dyn_size, pcpul_unit_size, pcpul_unit_size, pcpul_vm.addr);
pcpul_vm.addr);
/* sort pcpul_map array for pcpu_lpage_remapped() */ /* sort pcpul_map array for pcpu_lpage_remapped() */
for (i = 0; i < num_possible_cpus() - 1; i++) for (i = 0; i < num_possible_cpus() - 1; i++)