Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu
Conflicts: arch/x86/include/asm/pgtable.h
This commit is contained in:
Коммит
0edcf8d692
|
@ -189,9 +189,21 @@ callback_init(void * kernel_end)
|
|||
|
||||
if (alpha_using_srm) {
|
||||
static struct vm_struct console_remap_vm;
|
||||
unsigned long vaddr = VMALLOC_START;
|
||||
unsigned long nr_pages = 0;
|
||||
unsigned long vaddr;
|
||||
unsigned long i, j;
|
||||
|
||||
/* calculate needed size */
|
||||
for (i = 0; i < crb->map_entries; ++i)
|
||||
nr_pages += crb->map[i].count;
|
||||
|
||||
/* register the vm area */
|
||||
console_remap_vm.flags = VM_ALLOC;
|
||||
console_remap_vm.size = nr_pages << PAGE_SHIFT;
|
||||
vm_area_register_early(&console_remap_vm, PAGE_SIZE);
|
||||
|
||||
vaddr = (unsigned long)consle_remap_vm.addr;
|
||||
|
||||
/* Set up the third level PTEs and update the virtual
|
||||
addresses of the CRB entries. */
|
||||
for (i = 0; i < crb->map_entries; ++i) {
|
||||
|
@ -213,12 +225,6 @@ callback_init(void * kernel_end)
|
|||
vaddr += PAGE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
/* Let vmalloc know that we've allocated some space. */
|
||||
console_remap_vm.flags = VM_ALLOC;
|
||||
console_remap_vm.addr = (void *) VMALLOC_START;
|
||||
console_remap_vm.size = vaddr - VMALLOC_START;
|
||||
vmlist = &console_remap_vm;
|
||||
}
|
||||
|
||||
callback_init_done = 1;
|
||||
|
|
|
@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
|
|||
config QUICKLIST
|
||||
def_bool y
|
||||
|
||||
config HAVE_ARCH_BOOTMEM_NODE
|
||||
config HAVE_ARCH_BOOTMEM
|
||||
def_bool n
|
||||
|
||||
config ARCH_HAVE_MEMORY_PRESENT
|
||||
|
|
|
@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
|
|||
config HAVE_SETUP_PER_CPU_AREA
|
||||
def_bool y
|
||||
|
||||
config HAVE_DYNAMIC_PER_CPU_AREA
|
||||
def_bool y
|
||||
|
||||
config HAVE_CPUMASK_OF_CPU_MAP
|
||||
def_bool X86_64_SMP
|
||||
|
||||
|
@ -1122,7 +1125,7 @@ config NODES_SHIFT
|
|||
Specify the maximum number of NUMA Nodes available on the target
|
||||
system. Increases memory reserved to accommodate various tables.
|
||||
|
||||
config HAVE_ARCH_BOOTMEM_NODE
|
||||
config HAVE_ARCH_BOOTMEM
|
||||
def_bool y
|
||||
depends on X86_32 && NUMA
|
||||
|
||||
|
|
|
@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
|
|||
#endif /* CONFIG_DISCONTIGMEM */
|
||||
|
||||
#ifdef CONFIG_NEED_MULTIPLE_NODES
|
||||
|
||||
/*
|
||||
* Following are macros that are specific to this numa platform.
|
||||
*/
|
||||
#define reserve_bootmem(addr, size, flags) \
|
||||
reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
|
||||
#define alloc_bootmem(x) \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_nopanic(x) \
|
||||
__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
|
||||
__pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_low(x) \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
|
||||
#define alloc_bootmem_pages(x) \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_pages_nopanic(x) \
|
||||
__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
|
||||
__pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_low_pages(x) \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
|
||||
#define alloc_bootmem_node(pgdat, x) \
|
||||
/* always use node 0 for bootmem on this numa platform */
|
||||
#define alloc_bootmem_core(__bdata, size, align, goal, limit) \
|
||||
({ \
|
||||
struct pglist_data __maybe_unused \
|
||||
*__alloc_bootmem_node__pgdat = (pgdat); \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
|
||||
__pa(MAX_DMA_ADDRESS)); \
|
||||
})
|
||||
#define alloc_bootmem_pages_node(pgdat, x) \
|
||||
({ \
|
||||
struct pglist_data __maybe_unused \
|
||||
*__alloc_bootmem_node__pgdat = (pgdat); \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
|
||||
__pa(MAX_DMA_ADDRESS)); \
|
||||
})
|
||||
#define alloc_bootmem_low_pages_node(pgdat, x) \
|
||||
({ \
|
||||
struct pglist_data __maybe_unused \
|
||||
*__alloc_bootmem_node__pgdat = (pgdat); \
|
||||
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
|
||||
bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \
|
||||
__alloc_bootmem_core(NODE_DATA(0)->bdata, \
|
||||
(size), (align), (goal), (limit)); \
|
||||
})
|
||||
#endif /* CONFIG_NEED_MULTIPLE_NODES */
|
||||
|
||||
|
|
|
@ -43,6 +43,14 @@
|
|||
#else /* ...!ASSEMBLY */
|
||||
|
||||
#include <linux/stringify.h>
|
||||
#include <asm/sections.h>
|
||||
|
||||
/*
 * Convert between an address and a percpu pointer by rebasing it:
 * __addr_to_pcpu_ptr() subtracts pcpu_base_addr and adds __per_cpu_start,
 * __pcpu_ptr_to_addr() does the inverse.  The whole expansion is
 * parenthesized so that the cast cannot bind to a neighbouring postfix
 * operator or be split by sizeof at the point of use.
 */
#define __addr_to_pcpu_ptr(addr)					\
	((void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
		  + (unsigned long)__per_cpu_start))
#define __pcpu_ptr_to_addr(ptr)						\
	((void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
		  - (unsigned long)__per_cpu_start))
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
|
||||
|
|
|
@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
|
|||
return 1;
|
||||
}
|
||||
|
||||
pmd_t *populate_extra_pmd(unsigned long vaddr);
|
||||
pte_t *populate_extra_pte(unsigned long vaddr);
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
|
|
|
@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
|
|||
if (!data)
|
||||
return -ENOMEM;
|
||||
|
||||
data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
|
||||
data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
|
||||
per_cpu(drv_data, cpu) = data;
|
||||
|
||||
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <linux/cpu.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/percpu.h>
|
||||
|
||||
#include <asm/apic.h>
|
||||
|
||||
|
@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
|
|||
union irq_ctx {
|
||||
struct thread_info tinfo;
|
||||
u32 stack[THREAD_SIZE/sizeof(u32)];
|
||||
};
|
||||
} __attribute__((aligned(PAGE_SIZE)));
|
||||
|
||||
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
|
||||
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
|
||||
static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
|
||||
static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
|
||||
|
||||
static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
|
||||
static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
|
||||
|
||||
static void call_on_stack(void *func, void *stack)
|
||||
{
|
||||
|
@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
|
|||
u32 *isp, arg1, arg2;
|
||||
|
||||
curctx = (union irq_ctx *) current_thread_info();
|
||||
irqctx = hardirq_ctx[smp_processor_id()];
|
||||
irqctx = __get_cpu_var(hardirq_ctx);
|
||||
|
||||
/*
|
||||
* this is where we switch to the IRQ stack. However, if we are
|
||||
|
@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
|
|||
{
|
||||
union irq_ctx *irqctx;
|
||||
|
||||
if (hardirq_ctx[cpu])
|
||||
if (per_cpu(hardirq_ctx, cpu))
|
||||
return;
|
||||
|
||||
irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
|
||||
irqctx = &per_cpu(hardirq_stack, cpu);
|
||||
irqctx->tinfo.task = NULL;
|
||||
irqctx->tinfo.exec_domain = NULL;
|
||||
irqctx->tinfo.cpu = cpu;
|
||||
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
|
||||
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
|
||||
|
||||
hardirq_ctx[cpu] = irqctx;
|
||||
per_cpu(hardirq_ctx, cpu) = irqctx;
|
||||
|
||||
irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
|
||||
irqctx = &per_cpu(softirq_stack, cpu);
|
||||
irqctx->tinfo.task = NULL;
|
||||
irqctx->tinfo.exec_domain = NULL;
|
||||
irqctx->tinfo.cpu = cpu;
|
||||
irqctx->tinfo.preempt_count = 0;
|
||||
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
|
||||
|
||||
softirq_ctx[cpu] = irqctx;
|
||||
per_cpu(softirq_ctx, cpu) = irqctx;
|
||||
|
||||
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
|
||||
cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
|
||||
cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
|
||||
}
|
||||
|
||||
void irq_ctx_exit(int cpu)
|
||||
{
|
||||
hardirq_ctx[cpu] = NULL;
|
||||
per_cpu(hardirq_ctx, cpu) = NULL;
|
||||
}
|
||||
|
||||
asmlinkage void do_softirq(void)
|
||||
|
@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
|
|||
|
||||
if (local_softirq_pending()) {
|
||||
curctx = current_thread_info();
|
||||
irqctx = softirq_ctx[smp_processor_id()];
|
||||
irqctx = __get_cpu_var(softirq_ctx);
|
||||
irqctx->tinfo.task = curctx->task;
|
||||
irqctx->tinfo.previous_esp = current_stack_pointer;
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <linux/crash_dump.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/topology.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <asm/sections.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/setup.h>
|
||||
|
@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
|
|||
};
|
||||
EXPORT_SYMBOL(__per_cpu_offset);
|
||||
|
||||
/**
|
||||
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
|
||||
*
|
||||
* If NUMA is not configured or there is only one NUMA node available,
|
||||
* there is no reason to consider NUMA. This function determines
|
||||
* whether percpu allocation should consider NUMA or not.
|
||||
*
|
||||
* RETURNS:
|
||||
* true if NUMA should be considered; otherwise, false.
|
||||
*/
|
||||
static bool __init pcpu_need_numa(void)
|
||||
{
|
||||
#ifdef CONFIG_NEED_MULTIPLE_NODES
|
||||
pg_data_t *last = NULL;
|
||||
unsigned int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
int node = early_cpu_to_node(cpu);
|
||||
|
||||
if (node_online(node) && NODE_DATA(node) &&
|
||||
last && last != NODE_DATA(node))
|
||||
return true;
|
||||
|
||||
last = NODE_DATA(node);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
|
||||
* @cpu: cpu to allocate for
|
||||
* @size: size allocation in bytes
|
||||
* @align: alignment
|
||||
*
|
||||
* Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
|
||||
* does the right thing for NUMA regardless of the current
|
||||
* configuration.
|
||||
*
|
||||
* RETURNS:
|
||||
* Pointer to the allocated area on success, NULL on failure.
|
||||
*/
|
||||
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
|
||||
unsigned long align)
|
||||
{
|
||||
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
|
||||
#ifdef CONFIG_NEED_MULTIPLE_NODES
|
||||
int node = early_cpu_to_node(cpu);
|
||||
void *ptr;
|
||||
|
||||
if (!node_online(node) || !NODE_DATA(node)) {
|
||||
ptr = __alloc_bootmem_nopanic(size, align, goal);
|
||||
pr_info("cpu %d has no node %d or node-local memory\n",
|
||||
cpu, node);
|
||||
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
|
||||
cpu, size, __pa(ptr));
|
||||
} else {
|
||||
ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
|
||||
size, align, goal);
|
||||
pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
|
||||
"%016lx\n", cpu, size, node, __pa(ptr));
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
return __alloc_bootmem_nopanic(size, align, goal);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Remap allocator
|
||||
*
|
||||
* This allocator uses PMD page as unit. A PMD page is allocated for
|
||||
* each cpu and each is remapped into vmalloc area using PMD mapping.
|
||||
* As PMD page is quite large, only part of it is used for the first
|
||||
* chunk. Unused part is returned to the bootmem allocator.
|
||||
*
|
||||
* So, the PMD pages are mapped twice - once to the physical mapping
|
||||
* and to the vmalloc area for the first percpu chunk. The double
|
||||
* mapping does add one more PMD TLB entry pressure but still is much
|
||||
* better than only using 4k mappings while still being NUMA friendly.
|
||||
*/
|
||||
#ifdef CONFIG_NEED_MULTIPLE_NODES
|
||||
static size_t pcpur_size __initdata;
|
||||
static void **pcpur_ptrs __initdata;
|
||||
|
||||
/*
 * get_page callback of the remap allocator: return the page backing
 * page number @pageno of @cpu's unit, or NULL when @pageno lies at or
 * beyond pcpur_size.
 */
static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
	const size_t byte_off = (size_t)pageno << PAGE_SHIFT;

	return byte_off < pcpur_size ?
		virt_to_page(pcpur_ptrs[cpu] + byte_off) : NULL;
}
|
||||
|
||||
/**
 * setup_pcpu_remap - set up the first percpu chunk with the remap allocator
 * @static_size: size of the static percpu area in bytes
 *
 * Allocates one PMD_SIZE sized and aligned bootmem area per possible
 * cpu, keeps the first pcpur_size bytes of each (static area plus
 * PERCPU_DYNAMIC_RESERVE, page aligned) and gives the rest back, copies
 * the static percpu data in, maps every area with a large-page PMD into
 * an early-registered vm area and hands the result to
 * pcpu_setup_first_chunk().
 *
 * RETURNS:
 * The return value of pcpu_setup_first_chunk() on success (the caller
 * uses it as the percpu unit size), -EINVAL if this allocator is not
 * applicable, -ENOMEM if a per-cpu area could not be allocated.
 */
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
	static struct vm_struct vm;
	size_t ptrs_size;
	unsigned int cpu;
	ssize_t ret;

	/*
	 * If large page isn't supported, there's no benefit in doing
	 * this.  Also, on non-NUMA, embedding is better, so bail out
	 * unless NUMA actually has to be considered.  pcpu_need_numa()
	 * already performs the per-cpu node scan; testing it with the
	 * wrong polarity and then repeating the scan made every path
	 * return -EINVAL.
	 */
	if (!cpu_has_pse || !pcpu_need_numa())
		return -EINVAL;

	/*
	 * Currently supports only single page.  Supporting multiple
	 * pages won't be too difficult if it ever becomes necessary.
	 */
	pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
	if (pcpur_size > PMD_SIZE) {
		pr_warning("PERCPU: static data is larger than large page, "
			   "can't use large page\n");
		return -EINVAL;
	}

	/* allocate pointer array and alloc large pages */
	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
	pcpur_ptrs = alloc_bootmem(ptrs_size);

	for_each_possible_cpu(cpu) {
		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
		if (!pcpur_ptrs[cpu])
			goto enomem;

		/*
		 * Only use pcpur_size bytes and give back the rest.
		 *
		 * Ingo: The 2MB up-rounding bootmem is needed to make
		 * sure the partial 2MB page is still fully RAM - it's
		 * not well-specified to have a PAT-incompatible area
		 * (unmapped RAM, device memory, etc.) in that hole.
		 */
		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
			     PMD_SIZE - pcpur_size);

		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
	}

	/* allocate address and map */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * PMD_SIZE;
	vm_area_register_early(&vm, PMD_SIZE);

	for_each_possible_cpu(cpu) {
		pmd_t *pmd;

		pmd = populate_extra_pmd((unsigned long)vm.addr
					 + cpu * PMD_SIZE);
		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
				     PAGE_KERNEL_LARGE));
	}

	/* we're ready, commit */
	pr_info("PERCPU: Remapped at %p with large pages, static data "
		"%zu bytes\n", vm.addr, static_size);

	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
				     pcpur_size - static_size, vm.addr, NULL);
	goto out_free_ar;

enomem:
	/*
	 * Every area recorded in pcpur_ptrs[] has already had its tail
	 * beyond pcpur_size returned to bootmem above, so only the first
	 * pcpur_size bytes are still ours to free.  NOTE(review): this
	 * relies on alloc_bootmem() handing back zeroed memory so that
	 * untouched slots read as NULL - confirm.
	 */
	for_each_possible_cpu(cpu)
		if (pcpur_ptrs[cpu])
			free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
	return ret;
}
#else
/* Without CONFIG_NEED_MULTIPLE_NODES the remap allocator is never used. */
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
	return -EINVAL;
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Embedding allocator
|
||||
*
|
||||
* The first chunk is sized to just contain the static area plus
|
||||
* PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
|
||||
* bootmem allocator and used as-is without being mapped into vmalloc
|
||||
* area. This enables the first chunk to piggy back on the linear
|
||||
* physical PMD mapping and doesn't add any additional pressure to
|
||||
* TLB.
|
||||
*/
|
||||
static void *pcpue_ptr __initdata;
|
||||
static size_t pcpue_unit_size __initdata;
|
||||
|
||||
/*
 * get_page callback of the embedding allocator: units are laid out
 * back to back starting at pcpue_ptr, pcpue_unit_size bytes each.
 */
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
	void *unit = pcpue_ptr + cpu * pcpue_unit_size;
	size_t byte_off = (size_t)pageno << PAGE_SHIFT;

	return virt_to_page(unit + byte_off);
}
|
||||
|
||||
/**
 * setup_pcpu_embed - set up the first percpu chunk with the embedding allocator
 * @static_size: size of the static percpu area in bytes
 *
 * Allocates one contiguous bootmem area holding a unit for every
 * possible cpu, copies the static percpu data into each unit and passes
 * the area to pcpu_setup_first_chunk() as the base address.
 *
 * RETURNS:
 * The return value of pcpu_setup_first_chunk() on success (the caller
 * uses it as the percpu unit size), -EINVAL if this allocator is not
 * applicable, -ENOMEM if the area could not be allocated.
 */
static ssize_t __init setup_pcpu_embed(size_t static_size)
{
	unsigned int cpu;

	/*
	 * If large page isn't supported, there's no benefit in doing
	 * this.  Also, embedding allocation doesn't play well with
	 * NUMA.
	 */
	if (!cpu_has_pse || pcpu_need_numa())
		return -EINVAL;

	/* allocate and copy */
	pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
	/*
	 * PCPU_MIN_UNIT_SIZE is an unsigned long expression while the unit
	 * size is a size_t; these are distinct types on 32-bit, so compare
	 * them explicitly as size_t.
	 */
	pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
				       PAGE_SIZE);
	if (!pcpue_ptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
		       static_size);

	/* we're ready, commit */
	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
		pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);

	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
				      pcpue_unit_size,
				      pcpue_unit_size - static_size, pcpue_ptr,
				      NULL);
}
|
||||
|
||||
/*
|
||||
* 4k page allocator
|
||||
*
|
||||
* This is the basic allocator. Static percpu area is allocated
|
||||
* page-by-page and most of initialization is done by the generic
|
||||
* setup function.
|
||||
*/
|
||||
static struct page **pcpu4k_pages __initdata;
|
||||
static int pcpu4k_nr_static_pages __initdata;
|
||||
|
||||
/*
 * get_page callback of the 4k allocator: pcpu4k_pages[] holds
 * pcpu4k_nr_static_pages entries per cpu; page numbers at or beyond
 * that count have no page.
 */
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
	if (pageno >= pcpu4k_nr_static_pages)
		return NULL;

	return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
}
|
||||
|
||||
/*
 * populate_pte callback of the 4k allocator: forwards to
 * populate_extra_pte() and drops the PTE pointer it returns.
 */
static void __init pcpu4k_populate_pte(unsigned long addr)
{
	(void)populate_extra_pte(addr);
}
|
||||
|
||||
/**
 * setup_pcpu_4k - set up the first percpu chunk with the 4k page allocator
 * @static_size: size of the static percpu area in bytes
 *
 * Allocates PFN_UP(@static_size) individual pages for every possible
 * cpu, fills them from __per_cpu_load, records them in pcpu4k_pages[]
 * and calls pcpu_setup_first_chunk() with pcpu4k_get_page() and
 * pcpu4k_populate_pte() as callbacks.  The page pointer array is
 * released again before returning, on success and on failure.
 *
 * RETURNS:
 * The return value of pcpu_setup_first_chunk() on success, -ENOMEM if
 * a page could not be allocated.
 */
static ssize_t __init setup_pcpu_4k(size_t static_size)
{
	size_t pages_size;
	unsigned int cpu;
	int pg, nr_filled = 0;
	ssize_t ret;

	pcpu4k_nr_static_pages = PFN_UP(static_size);

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
			       * sizeof(pcpu4k_pages[0]));
	pcpu4k_pages = alloc_bootmem(pages_size);

	/* allocate and copy, one page at a time */
	for_each_possible_cpu(cpu) {
		for (pg = 0; pg < pcpu4k_nr_static_pages; pg++) {
			void *ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE,
						       PAGE_SIZE);

			if (!ptr)
				goto enomem;

			memcpy(ptr, __per_cpu_load + pg * PAGE_SIZE, PAGE_SIZE);
			pcpu4k_pages[nr_filled] = virt_to_page(ptr);
			nr_filled++;
		}
	}

	/* we're ready, commit */
	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
		pcpu4k_nr_static_pages, static_size);

	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
				     pcpu4k_populate_pte);
	goto out_free_ar;

enomem:
	/* give back the pages allocated so far, newest first */
	for (nr_filled--; nr_filled >= 0; nr_filled--)
		free_bootmem(__pa(page_address(pcpu4k_pages[nr_filled])),
			     PAGE_SIZE);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpu4k_pages), pages_size);
	return ret;
}
|
||||
|
||||
static inline void setup_percpu_segment(int cpu)
|
||||
{
|
||||
#ifdef CONFIG_X86_32
|
||||
|
@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
|
|||
*/
|
||||
void __init setup_per_cpu_areas(void)
|
||||
{
|
||||
ssize_t size;
|
||||
char *ptr;
|
||||
int cpu;
|
||||
|
||||
/* Copy section for each CPU (we discard the original) */
|
||||
size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
|
||||
size_t static_size = __per_cpu_end - __per_cpu_start;
|
||||
unsigned int cpu;
|
||||
unsigned long delta;
|
||||
size_t pcpu_unit_size;
|
||||
ssize_t ret;
|
||||
|
||||
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
|
||||
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
|
||||
|
||||
pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
|
||||
/*
|
||||
* Allocate percpu area. If PSE is supported, try to make use
|
||||
* of large page mappings. Please read comments on top of
|
||||
* each allocator for details.
|
||||
*/
|
||||
ret = setup_pcpu_remap(static_size);
|
||||
if (ret < 0)
|
||||
ret = setup_pcpu_embed(static_size);
|
||||
if (ret < 0)
|
||||
ret = setup_pcpu_4k(static_size);
|
||||
if (ret < 0)
|
||||
panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
|
||||
static_size, ret);
|
||||
|
||||
pcpu_unit_size = ret;
|
||||
|
||||
/* alrighty, percpu areas up and running */
|
||||
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
|
||||
for_each_possible_cpu(cpu) {
|
||||
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
||||
ptr = alloc_bootmem_pages(size);
|
||||
#else
|
||||
int node = early_cpu_to_node(cpu);
|
||||
if (!node_online(node) || !NODE_DATA(node)) {
|
||||
ptr = alloc_bootmem_pages(size);
|
||||
pr_info("cpu %d has no node %d or node-local memory\n",
|
||||
cpu, node);
|
||||
pr_debug("per cpu data for cpu%d at %016lx\n",
|
||||
cpu, __pa(ptr));
|
||||
} else {
|
||||
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
|
||||
pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
|
||||
cpu, node, __pa(ptr));
|
||||
}
|
||||
#endif
|
||||
|
||||
memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
|
||||
per_cpu_offset(cpu) = ptr - __per_cpu_start;
|
||||
per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
|
||||
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
|
||||
per_cpu(cpu_number, cpu) = cpu;
|
||||
setup_percpu_segment(cpu);
|
||||
|
|
|
@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
|
|||
return pte_offset_kernel(pmd, 0);
|
||||
}
|
||||
|
||||
/*
 * Return the PMD entry for @vaddr in swapper_pg_dir, going through
 * one_md_table_init() for the PGD entry that covers @vaddr.
 */
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd = swapper_pg_dir + pgd_index(vaddr);
	pmd_t *pmd_table = one_md_table_init(pgd);

	return pmd_table + pmd_index(vaddr);
}
|
||||
|
||||
/*
 * Return the PTE entry for @vaddr: look up its PMD entry with
 * populate_extra_pmd() and go through one_page_table_init() for it.
 */
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd = populate_extra_pmd(vaddr);
	pte_t *pte_table = one_page_table_init(pmd);

	return pte_table + pte_index(vaddr);
}
|
||||
|
||||
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
|
||||
unsigned long vaddr, pte_t *lastpte)
|
||||
{
|
||||
|
|
|
@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
|
|||
return ptr;
|
||||
}
|
||||
|
||||
void
|
||||
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
|
||||
/*
 * Return the PUD entry for @vaddr under @pgd.  If @pgd is empty, a page
 * from spp_getpage() is installed as its PUD table first; a mismatch
 * between that page and what the PGD entry then points to is logged.
 */
static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	pud_t *new_pud;

	if (!pgd_none(*pgd))
		return pud_offset(pgd, vaddr);

	new_pud = (pud_t *)spp_getpage();
	pgd_populate(&init_mm, pgd, new_pud);
	if (new_pud != pud_offset(pgd, 0))
		printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
		       new_pud, pud_offset(pgd, 0));

	return pud_offset(pgd, vaddr);
}
|
||||
|
||||
/*
 * Return the PMD entry for @vaddr under @pud.  If @pud is empty, a page
 * from spp_getpage() is installed as its PMD table first; a mismatch
 * between that page and what the PUD entry then points to is logged.
 */
static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
{
	pmd_t *new_pmd;

	if (!pud_none(*pud))
		return pmd_offset(pud, vaddr);

	new_pmd = (pmd_t *) spp_getpage();
	pud_populate(&init_mm, pud, new_pmd);
	if (new_pmd != pmd_offset(pud, 0))
		printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
		       new_pmd, pmd_offset(pud, 0));

	return pmd_offset(pud, vaddr);
}
|
||||
|
||||
/*
 * Return the PTE entry for @vaddr under @pmd.  If @pmd is empty, a page
 * from spp_getpage() is installed as its PTE table first; a mismatch
 * between that page and what the PMD entry then points to is logged.
 */
static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	pte_t *new_pte;

	if (!pmd_none(*pmd))
		return pte_offset_kernel(pmd, vaddr);

	new_pte = (pte_t *) spp_getpage();
	pmd_populate_kernel(&init_mm, pmd, new_pte);
	if (new_pte != pte_offset_kernel(pmd, 0))
		printk(KERN_ERR "PAGETABLE BUG #02!\n");

	return pte_offset_kernel(pmd, vaddr);
}
|
||||
|
||||
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
|
||||
{
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
|
||||
pud = pud_page + pud_index(vaddr);
|
||||
if (pud_none(*pud)) {
|
||||
pmd = (pmd_t *) spp_getpage();
|
||||
pud_populate(&init_mm, pud, pmd);
|
||||
if (pmd != pmd_offset(pud, 0)) {
|
||||
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
|
||||
pmd, pmd_offset(pud, 0));
|
||||
return;
|
||||
}
|
||||
}
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
if (pmd_none(*pmd)) {
|
||||
pte = (pte_t *) spp_getpage();
|
||||
pmd_populate_kernel(&init_mm, pmd, pte);
|
||||
if (pte != pte_offset_kernel(pmd, 0)) {
|
||||
printk(KERN_ERR "PAGETABLE BUG #02!\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
pmd = fill_pmd(pud, vaddr);
|
||||
pte = fill_pte(pmd, vaddr);
|
||||
|
||||
pte = pte_offset_kernel(pmd, vaddr);
|
||||
set_pte(pte, new_pte);
|
||||
|
||||
/*
|
||||
|
@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
|
|||
__flush_tlb_one(vaddr);
|
||||
}
|
||||
|
||||
void
|
||||
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
|
||||
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud_page;
|
||||
|
@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
|
|||
set_pte_vaddr_pud(pud_page, vaddr, pteval);
|
||||
}
|
||||
|
||||
/*
 * Return the kernel PMD entry for @vaddr, filling in the PUD and PMD
 * levels through fill_pud() and fill_pmd() on the way down.
 */
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pud_t *pud = fill_pud(pgd_offset_k(vaddr), vaddr);

	return fill_pmd(pud, vaddr);
}
|
||||
|
||||
/*
 * Return the kernel PTE entry for @vaddr: populate_extra_pmd() provides
 * the PMD entry, fill_pte() the PTE level below it.
 */
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	return fill_pte(populate_extra_pmd(vaddr), vaddr);
}
|
||||
|
||||
/*
|
||||
* Create large page table mappings for a range of physical addresses.
|
||||
*/
|
||||
|
|
|
@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
|
|||
if (!bt->sequence)
|
||||
goto err;
|
||||
|
||||
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
|
||||
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
|
||||
if (!bt->msg_data)
|
||||
goto err;
|
||||
|
||||
|
|
|
@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
|
|||
continue;
|
||||
}
|
||||
|
||||
if (!performance || !percpu_ptr(performance, i)) {
|
||||
if (!performance || !per_cpu_ptr(performance, i)) {
|
||||
retval = -EINVAL;
|
||||
continue;
|
||||
}
|
||||
|
||||
pr->performance = percpu_ptr(performance, i);
|
||||
pr->performance = per_cpu_ptr(performance, i);
|
||||
cpumask_set_cpu(i, pr->performance->shared_cpu_map);
|
||||
if (acpi_processor_get_psd(pr)) {
|
||||
retval = -EINVAL;
|
||||
|
|
|
@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
|
|||
#define BOOTMEM_DEFAULT 0
|
||||
#define BOOTMEM_EXCLUSIVE (1<<0)
|
||||
|
||||
extern int reserve_bootmem(unsigned long addr,
|
||||
unsigned long size,
|
||||
int flags);
|
||||
extern int reserve_bootmem_node(pg_data_t *pgdat,
|
||||
unsigned long physaddr,
|
||||
unsigned long size,
|
||||
int flags);
|
||||
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
||||
extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
|
||||
#endif
|
||||
unsigned long physaddr,
|
||||
unsigned long size,
|
||||
int flags);
|
||||
|
||||
extern void *__alloc_bootmem_nopanic(unsigned long size,
|
||||
extern void *__alloc_bootmem(unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal);
|
||||
extern void *__alloc_bootmem(unsigned long size,
|
||||
extern void *__alloc_bootmem_nopanic(unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal);
|
||||
extern void *__alloc_bootmem_low(unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal);
|
||||
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
|
||||
unsigned long size,
|
||||
unsigned long align,
|
||||
|
@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
|
|||
unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal);
|
||||
extern void *__alloc_bootmem_low(unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal);
|
||||
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
|
||||
unsigned long size,
|
||||
unsigned long align,
|
||||
unsigned long goal);
|
||||
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
||||
|
||||
#define alloc_bootmem(x) \
|
||||
__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_nopanic(x) \
|
||||
__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_low(x) \
|
||||
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
|
||||
#define alloc_bootmem_pages(x) \
|
||||
__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_pages_nopanic(x) \
|
||||
__alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_low_pages(x) \
|
||||
__alloc_bootmem_low(x, PAGE_SIZE, 0)
|
||||
#define alloc_bootmem_node(pgdat, x) \
|
||||
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_pages_node(pgdat, x) \
|
||||
__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
||||
#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
|
||||
__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
|
||||
|
||||
#define alloc_bootmem_low(x) \
|
||||
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
|
||||
#define alloc_bootmem_low_pages(x) \
|
||||
__alloc_bootmem_low(x, PAGE_SIZE, 0)
|
||||
#define alloc_bootmem_low_pages_node(pgdat, x) \
|
||||
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
|
||||
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
|
||||
|
||||
extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
|
||||
int flags);
|
||||
|
|
|
@ -76,52 +76,98 @@
|
|||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
|
||||
|
||||
/* minimum unit size, also is the maximum supported allocation size */
|
||||
#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT)
|
||||
|
||||
/*
|
||||
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
|
||||
* back on the first chunk if arch is manually allocating and mapping
|
||||
* it for faster access (as a part of large page mapping for example).
|
||||
* Note that dynamic percpu allocator covers both static and dynamic
|
||||
* areas, so these values are bigger than PERCPU_MODULE_RESERVE.
|
||||
*
|
||||
* On typical configuration with modules, the following values leave
|
||||
* about 8k of free space on the first chunk after boot on both x86_32
|
||||
* and 64 when module support is enabled. When module support is
|
||||
* disabled, it's much tighter.
|
||||
*/
|
||||
#ifndef PERCPU_DYNAMIC_RESERVE
|
||||
# if BITS_PER_LONG > 32
|
||||
# ifdef CONFIG_MODULES
|
||||
# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT)
|
||||
# else
|
||||
# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
|
||||
# endif
|
||||
# else
|
||||
# ifdef CONFIG_MODULES
|
||||
# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
|
||||
# else
|
||||
# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT)
|
||||
# endif
|
||||
# endif
|
||||
#endif /* PERCPU_DYNAMIC_RESERVE */
|
||||
|
||||
extern void *pcpu_base_addr;
|
||||
|
||||
typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
|
||||
typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
|
||||
|
||||
extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
|
||||
size_t static_size, size_t unit_size,
|
||||
size_t free_size, void *base_addr,
|
||||
pcpu_populate_pte_fn_t populate_pte_fn);
|
||||
|
||||
/*
|
||||
* Use this to get to a cpu's version of the per-cpu object
|
||||
* dynamically allocated. Non-atomic access to the current CPU's
|
||||
* version should probably be combined with get_cpu()/put_cpu().
|
||||
*/
|
||||
#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
|
||||
|
||||
#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
|
||||
|
||||
struct percpu_data {
|
||||
void *ptrs[1];
|
||||
};
|
||||
|
||||
/*
 * Disguise (or recover) a struct percpu_data pointer by bitwise-inverting
 * its value; applying the macro twice yields the original pointer.  The
 * whole expansion is parenthesized so the result can safely be used inside
 * a larger expression, e.g. directly followed by "->".
 */
#define __percpu_disguise(pdata) \
	((struct percpu_data *)~(unsigned long)(pdata))
|
||||
/*
|
||||
* Use this to get to a cpu's version of the per-cpu object dynamically
|
||||
* allocated. Non-atomic access to the current CPU's version should
|
||||
* probably be combined with get_cpu()/put_cpu().
|
||||
*/
|
||||
#define percpu_ptr(ptr, cpu) \
|
||||
({ \
|
||||
struct percpu_data *__p = __percpu_disguise(ptr); \
|
||||
(__typeof__(ptr))__p->ptrs[(cpu)]; \
|
||||
|
||||
#define per_cpu_ptr(ptr, cpu) \
|
||||
({ \
|
||||
struct percpu_data *__p = __percpu_disguise(ptr); \
|
||||
(__typeof__(ptr))__p->ptrs[(cpu)]; \
|
||||
})
|
||||
|
||||
extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
|
||||
extern void percpu_free(void *__pdata);
|
||||
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
|
||||
|
||||
extern void *__alloc_percpu(size_t size, size_t align);
|
||||
extern void free_percpu(void *__pdata);
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
|
||||
#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
|
||||
|
||||
static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
|
||||
static inline void *__alloc_percpu(size_t size, size_t align)
|
||||
{
|
||||
/*
|
||||
* Can't easily make larger alignment work with kmalloc. WARN
|
||||
* on it. Larger alignment should only be used for module
|
||||
* percpu sections on SMP for which this path isn't used.
|
||||
*/
|
||||
WARN_ON_ONCE(align > __alignof__(unsigned long long));
|
||||
return kzalloc(size, gfp);
|
||||
}
|
||||
|
||||
static inline void percpu_free(void *__pdata)
|
||||
static inline void free_percpu(void *p)
|
||||
{
|
||||
kfree(__pdata);
|
||||
kfree(p);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#define percpu_alloc_mask(size, gfp, mask) \
|
||||
__percpu_alloc_mask((size), (gfp), &(mask))
|
||||
|
||||
#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
|
||||
|
||||
/* (legacy) interface for use without CPU hotplug handling */
|
||||
|
||||
#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \
|
||||
cpu_possible_map)
|
||||
#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type))
|
||||
#define free_percpu(ptr) percpu_free((ptr))
|
||||
#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu))
|
||||
#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
|
||||
__alignof__(type))
|
||||
|
||||
#endif /* __LINUX_PERCPU_H */
|
||||
|
|
|
@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
|
|||
|
||||
extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
|
||||
struct page ***pages);
|
||||
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
|
||||
pgprot_t prot, struct page **pages);
|
||||
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
|
||||
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
|
||||
|
||||
/* Allocate/destroy a 'vmalloc' VM area. */
|
||||
|
@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
|
|||
*/
|
||||
extern rwlock_t vmlist_lock;
|
||||
extern struct vm_struct *vmlist;
|
||||
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
|
||||
|
||||
#endif /* _LINUX_VMALLOC_H */
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
#include <linux/tracepoint.h>
|
||||
#include <linux/ftrace.h>
|
||||
#include <linux/async.h>
|
||||
#include <linux/percpu.h>
|
||||
|
||||
#if 0
|
||||
#define DEBUGP printk
|
||||
|
@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
|
||||
|
||||
static void *percpu_modalloc(unsigned long size, unsigned long align,
|
||||
const char *name)
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
if (align > PAGE_SIZE) {
|
||||
printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
|
||||
name, align, PAGE_SIZE);
|
||||
align = PAGE_SIZE;
|
||||
}
|
||||
|
||||
ptr = __alloc_percpu(size, align);
|
||||
if (!ptr)
|
||||
printk(KERN_WARNING
|
||||
"Could not allocate %lu bytes percpu data\n", size);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*
 * Counterpart of percpu_modalloc(): hand @freeme back through
 * free_percpu().
 */
static void percpu_modfree(void *freeme)
{
	free_percpu(freeme);
}
|
||||
|
||||
#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
|
||||
|
||||
/* Number of blocks used and allocated. */
|
||||
static unsigned int pcpu_num_used, pcpu_num_allocated;
|
||||
/* Size of each block. -ve means used. */
|
||||
|
@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
|
|||
}
|
||||
}
|
||||
|
||||
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
|
||||
Elf_Shdr *sechdrs,
|
||||
const char *secstrings)
|
||||
{
|
||||
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
|
||||
}
|
||||
|
||||
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
|
||||
}
|
||||
|
||||
static int percpu_modinit(void)
|
||||
{
|
||||
pcpu_num_used = 2;
|
||||
|
@ -513,7 +527,26 @@ static int percpu_modinit(void)
|
|||
return 0;
|
||||
}
|
||||
__initcall(percpu_modinit);
|
||||
|
||||
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
|
||||
|
||||
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
|
||||
Elf_Shdr *sechdrs,
|
||||
const char *secstrings)
|
||||
{
|
||||
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
|
||||
}
|
||||
|
||||
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
|
||||
}
|
||||
|
||||
#else /* ... !CONFIG_SMP */
|
||||
|
||||
static inline void *percpu_modalloc(unsigned long size, unsigned long align,
|
||||
const char *name)
|
||||
{
|
||||
|
@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
|
|||
/* pcpusec should be 0, and size of that section should be 0. */
|
||||
BUG_ON(size != 0);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#define MODINFO_ATTR(field) \
|
||||
|
|
|
@ -9476,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
|
|||
|
||||
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
|
||||
{
|
||||
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
|
||||
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
||||
u64 data;
|
||||
|
||||
#ifndef CONFIG_64BIT
|
||||
|
@ -9495,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
|
|||
|
||||
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
|
||||
{
|
||||
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
|
||||
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
||||
|
||||
#ifndef CONFIG_64BIT
|
||||
/*
|
||||
|
@ -9591,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
|||
ca = task_ca(tsk);
|
||||
|
||||
for (; ca; ca = ca->parent) {
|
||||
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
|
||||
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
|
||||
*cpuusage += cputime;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
|
|||
* doesn't hit this CPU until we're ready. */
|
||||
get_cpu();
|
||||
for_each_online_cpu(i) {
|
||||
sm_work = percpu_ptr(stop_machine_work, i);
|
||||
sm_work = per_cpu_ptr(stop_machine_work, i);
|
||||
INIT_WORK(sm_work, stop_cpu);
|
||||
queue_work_on(i, stop_machine_wq, sm_work);
|
||||
}
|
||||
|
|
|
@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
|
|||
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
|
||||
obj-$(CONFIG_FS_XIP) += filemap_xip.o
|
||||
obj-$(CONFIG_MIGRATION) += migrate.o
|
||||
ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
|
||||
obj-$(CONFIG_SMP) += percpu.o
|
||||
else
|
||||
obj-$(CONFIG_SMP) += allocpercpu.o
|
||||
endif
|
||||
obj-$(CONFIG_QUICKLIST) += quicklist.o
|
||||
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
|
||||
|
|
|
@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
|
|||
__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
|
||||
|
||||
/**
|
||||
* percpu_alloc_mask - initial setup of per-cpu data
|
||||
* alloc_percpu - initial setup of per-cpu data
|
||||
* @size: size of per-cpu object
|
||||
* @gfp: may sleep or not etc.
|
||||
* @mask: populate per-data for cpu's selected through mask bits
|
||||
* @align: alignment
|
||||
*
|
||||
* Populating per-cpu data for all online cpu's would be a typical use case,
|
||||
* which is simplified by the percpu_alloc() wrapper.
|
||||
* Per-cpu objects are populated with zeroed buffers.
|
||||
* Allocate dynamic percpu area. Percpu objects are populated with
|
||||
* zeroed buffers.
|
||||
*/
|
||||
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
|
||||
void *__alloc_percpu(size_t size, size_t align)
|
||||
{
|
||||
/*
|
||||
* We allocate whole cache lines to avoid false sharing
|
||||
*/
|
||||
size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
|
||||
void *pdata = kzalloc(sz, gfp);
|
||||
void *pdata = kzalloc(sz, GFP_KERNEL);
|
||||
void *__pdata = __percpu_disguise(pdata);
|
||||
|
||||
/*
|
||||
* Can't easily make larger alignment work with kmalloc. WARN
|
||||
* on it. Larger alignment should only be used for module
|
||||
* percpu sections on SMP for which this path isn't used.
|
||||
*/
|
||||
WARN_ON_ONCE(align > __alignof__(unsigned long long));
|
||||
|
||||
if (unlikely(!pdata))
|
||||
return NULL;
|
||||
if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
|
||||
if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
|
||||
&cpu_possible_map)))
|
||||
return __pdata;
|
||||
kfree(pdata);
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
|
||||
EXPORT_SYMBOL_GPL(__alloc_percpu);
|
||||
|
||||
/**
|
||||
* percpu_free - final cleanup of per-cpu data
|
||||
* free_percpu - final cleanup of per-cpu data
|
||||
* @__pdata: object to clean up
|
||||
*
|
||||
* We simply clean up any per-cpu object left. No need for the client to
|
||||
* track and specify through a bit mask which per-cpu objects are to free.
|
||||
*/
|
||||
void percpu_free(void *__pdata)
|
||||
void free_percpu(void *__pdata)
|
||||
{
|
||||
if (unlikely(!__pdata))
|
||||
return;
|
||||
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
|
||||
kfree(__percpu_disguise(__pdata));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_free);
|
||||
EXPORT_SYMBOL_GPL(free_percpu);
|
||||
|
|
14
mm/bootmem.c
14
mm/bootmem.c
|
@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
|
|||
|
||||
static int bootmem_debug;
|
||||
|
||||
/*
|
||||
* If an arch needs to apply workarounds to bootmem allocation, it can
|
||||
* set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
|
||||
* __alloc_bootmem_core().
|
||||
*/
|
||||
#ifndef CONFIG_HAVE_ARCH_BOOTMEM
|
||||
#define alloc_bootmem_core(bdata, size, align, goal, limit) \
|
||||
__alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
|
||||
#endif
|
||||
|
||||
static int __init bootmem_debug_setup(char *buf)
|
||||
{
|
||||
bootmem_debug = 1;
|
||||
|
@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
|
|||
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
|
||||
/**
|
||||
* reserve_bootmem - mark a page range as reserved
|
||||
* @addr: starting address of the range
|
||||
|
@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
|
|||
|
||||
return mark_bootmem(start, end, 1, flags);
|
||||
}
|
||||
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
|
||||
|
||||
static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
|
||||
unsigned long step)
|
||||
|
@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
|
|||
return ALIGN(base + off, align) - base;
|
||||
}
|
||||
|
||||
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
|
||||
static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
|
||||
unsigned long size, unsigned long align,
|
||||
unsigned long goal, unsigned long limit)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,979 @@
|
|||
/*
|
||||
* linux/mm/percpu.c - percpu memory allocator
|
||||
*
|
||||
* Copyright (C) 2009 SUSE Linux Products GmbH
|
||||
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
|
||||
*
|
||||
* This file is released under the GPLv2.
|
||||
*
|
||||
* This is percpu allocator which can handle both static and dynamic
|
||||
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
|
||||
* chunk consists of num_possible_cpus() units and the first chunk
|
||||
* is used for static percpu variables in the kernel image (special
|
||||
* boot time alloc/init handling necessary as these areas need to be
|
||||
* brought up before allocation services are running). Unit grows as
|
||||
* necessary and all units grow or shrink in unison. When a chunk is
|
||||
* filled up, another chunk is allocated. ie. in vmalloc area
|
||||
*
|
||||
* c0 c1 c2
|
||||
* ------------------- ------------------- ------------
|
||||
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
|
||||
* ------------------- ...... ------------------- .... ------------
|
||||
*
|
||||
* Allocation is done in offset-size areas of single unit space. Ie,
|
||||
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
|
||||
* c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
|
||||
* percpu base registers UNIT_SIZE apart.
|
||||
*
|
||||
* There are usually many small percpu allocations many of them as
|
||||
* small as 4 bytes. The allocator organizes chunks into lists
|
||||
* according to free size and tries to allocate from the fullest one.
|
||||
* Each chunk keeps the maximum contiguous area size hint which is
|
||||
* guaranteed to be equal to or larger than the maximum contiguous
|
||||
* area in the chunk. This helps the allocator not to iterate the
|
||||
* chunk maps unnecessarily.
|
||||
*
|
||||
* Allocation state in each chunk is kept using an array of integers
|
||||
* on chunk->map. A positive value in the map represents a free
|
||||
* region and negative allocated. Allocation inside a chunk is done
|
||||
* by scanning this map sequentially and serving the first matching
|
||||
* entry. This is mostly copied from the percpu_modalloc() allocator.
|
||||
* Chunks are also linked into a rb tree to ease address to chunk
|
||||
* mapping during free.
|
||||
*
|
||||
* To use this allocator, arch code should do the following.
|
||||
*
|
||||
* - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
|
||||
*
|
||||
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
|
||||
* regular address to percpu pointer and back
|
||||
*
|
||||
* - use pcpu_setup_first_chunk() during percpu area initialization to
|
||||
* setup the first chunk containing the kernel static percpu area
|
||||
*/
|
||||
|
||||
#include <linux/bitmap.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/pfn.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
|
||||
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
|
||||
|
||||
struct pcpu_chunk {
|
||||
struct list_head list; /* linked to pcpu_slot lists */
|
||||
struct rb_node rb_node; /* key is chunk->vm->addr */
|
||||
int free_size; /* free bytes in the chunk */
|
||||
int contig_hint; /* max contiguous size hint */
|
||||
struct vm_struct *vm; /* mapped vmalloc region */
|
||||
int map_used; /* # of map entries used */
|
||||
int map_alloc; /* # of map entries allocated */
|
||||
int *map; /* allocation map */
|
||||
bool immutable; /* no [de]population allowed */
|
||||
struct page *page[]; /* #cpus * UNIT_PAGES */
|
||||
};
|
||||
|
||||
static int pcpu_unit_pages __read_mostly;
|
||||
static int pcpu_unit_size __read_mostly;
|
||||
static int pcpu_chunk_size __read_mostly;
|
||||
static int pcpu_nr_slots __read_mostly;
|
||||
static size_t pcpu_chunk_struct_size __read_mostly;
|
||||
|
||||
/* the address of the first chunk which starts with the kernel static area */
|
||||
void *pcpu_base_addr __read_mostly;
|
||||
EXPORT_SYMBOL_GPL(pcpu_base_addr);
|
||||
|
||||
/* the size of kernel static area */
|
||||
static int pcpu_static_size __read_mostly;
|
||||
|
||||
/*
|
||||
* One mutex to rule them all.
|
||||
*
|
||||
* The following mutex is grabbed in the outermost public alloc/free
|
||||
* interface functions and released only when the operation is
|
||||
* complete. As such, every function in this file other than the
|
||||
* outermost functions are called under pcpu_mutex.
|
||||
*
|
||||
* It can easily be switched to use spinlock such that only the area
|
||||
* allocation and page population commit are protected with it doing
|
||||
* actual [de]allocation without holding any lock. However, given
|
||||
* what this allocator does, I think it's better to let them run
|
||||
* sequentially.
|
||||
*/
|
||||
static DEFINE_MUTEX(pcpu_mutex);
|
||||
|
||||
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
|
||||
static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
|
||||
|
||||
static int __pcpu_size_to_slot(int size)
|
||||
{
|
||||
int highbit = fls(size); /* size is in bytes */
|
||||
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
|
||||
}
|
||||
|
||||
static int pcpu_size_to_slot(int size)
|
||||
{
|
||||
if (size == pcpu_unit_size)
|
||||
return pcpu_nr_slots - 1;
|
||||
return __pcpu_size_to_slot(size);
|
||||
}
|
||||
|
||||
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
|
||||
{
|
||||
if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
|
||||
return 0;
|
||||
|
||||
return pcpu_size_to_slot(chunk->free_size);
|
||||
}
|
||||
|
||||
static int pcpu_page_idx(unsigned int cpu, int page_idx)
|
||||
{
|
||||
return cpu * pcpu_unit_pages + page_idx;
|
||||
}
|
||||
|
||||
static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
|
||||
unsigned int cpu, int page_idx)
|
||||
{
|
||||
return &chunk->page[pcpu_page_idx(cpu, page_idx)];
|
||||
}
|
||||
|
||||
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
|
||||
unsigned int cpu, int page_idx)
|
||||
{
|
||||
return (unsigned long)chunk->vm->addr +
|
||||
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
|
||||
int page_idx)
|
||||
{
|
||||
return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_realloc - versatile realloc
|
||||
* @p: the current pointer (can be NULL for new allocations)
|
||||
* @size: the current size in bytes (can be 0 for new allocations)
|
||||
* @new_size: the wanted new size in bytes (can be 0 for free)
|
||||
*
|
||||
* More robust realloc which can be used to allocate, resize or free a
|
||||
* memory area of arbitrary size. If the needed size goes over
|
||||
* PAGE_SIZE, kernel VM is used.
|
||||
*
|
||||
* RETURNS:
|
||||
* The new pointer on success, NULL on failure.
|
||||
*/
|
||||
static void *pcpu_realloc(void *p, size_t size, size_t new_size)
|
||||
{
|
||||
void *new;
|
||||
|
||||
if (new_size <= PAGE_SIZE)
|
||||
new = kmalloc(new_size, GFP_KERNEL);
|
||||
else
|
||||
new = vmalloc(new_size);
|
||||
if (new_size && !new)
|
||||
return NULL;
|
||||
|
||||
memcpy(new, p, min(size, new_size));
|
||||
if (new_size > size)
|
||||
memset(new + size, 0, new_size - size);
|
||||
|
||||
if (size <= PAGE_SIZE)
|
||||
kfree(p);
|
||||
else
|
||||
vfree(p);
|
||||
|
||||
return new;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
|
||||
* @chunk: chunk of interest
|
||||
* @oslot: the previous slot it was on
|
||||
*
|
||||
* This function is called after an allocation or free changed @chunk.
|
||||
* New slot according to the changed state is determined and @chunk is
|
||||
* moved to the slot.
|
||||
*/
|
||||
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
|
||||
{
|
||||
int nslot = pcpu_chunk_slot(chunk);
|
||||
|
||||
if (oslot != nslot) {
|
||||
if (oslot < nslot)
|
||||
list_move(&chunk->list, &pcpu_slot[nslot]);
|
||||
else
|
||||
list_move_tail(&chunk->list, &pcpu_slot[nslot]);
|
||||
}
|
||||
}
|
||||
|
||||
static struct rb_node **pcpu_chunk_rb_search(void *addr,
|
||||
struct rb_node **parentp)
|
||||
{
|
||||
struct rb_node **p = &pcpu_addr_root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct pcpu_chunk *chunk;
|
||||
|
||||
while (*p) {
|
||||
parent = *p;
|
||||
chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
|
||||
|
||||
if (addr < chunk->vm->addr)
|
||||
p = &(*p)->rb_left;
|
||||
else if (addr > chunk->vm->addr)
|
||||
p = &(*p)->rb_right;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
if (parentp)
|
||||
*parentp = parent;
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_chunk_addr_search - search for chunk containing specified address
|
||||
* @addr: address to search for
|
||||
*
|
||||
* Look for chunk which might contain @addr. More specifically, it
|
||||
* searchs for the chunk with the highest start address which isn't
|
||||
* beyond @addr.
|
||||
*
|
||||
* RETURNS:
|
||||
* The address of the found chunk.
|
||||
*/
|
||||
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
|
||||
{
|
||||
struct rb_node *n, *parent;
|
||||
struct pcpu_chunk *chunk;
|
||||
|
||||
n = *pcpu_chunk_rb_search(addr, &parent);
|
||||
if (!n) {
|
||||
/* no exactly matching chunk, the parent is the closest */
|
||||
n = parent;
|
||||
BUG_ON(!n);
|
||||
}
|
||||
chunk = rb_entry(n, struct pcpu_chunk, rb_node);
|
||||
|
||||
if (addr < chunk->vm->addr) {
|
||||
/* the parent was the next one, look for the previous one */
|
||||
n = rb_prev(n);
|
||||
BUG_ON(!n);
|
||||
chunk = rb_entry(n, struct pcpu_chunk, rb_node);
|
||||
}
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_chunk_addr_insert - insert chunk into address rb tree
|
||||
* @new: chunk to insert
|
||||
*
|
||||
* Insert @new into address rb tree.
|
||||
*/
|
||||
static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
|
||||
{
|
||||
struct rb_node **p, *parent;
|
||||
|
||||
p = pcpu_chunk_rb_search(new->vm->addr, &parent);
|
||||
BUG_ON(*p);
|
||||
rb_link_node(&new->rb_node, parent, p);
|
||||
rb_insert_color(&new->rb_node, &pcpu_addr_root);
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_split_block - split a map block
|
||||
* @chunk: chunk of interest
|
||||
* @i: index of map block to split
|
||||
* @head: head size in bytes (can be 0)
|
||||
* @tail: tail size in bytes (can be 0)
|
||||
*
|
||||
* Split the @i'th map block into two or three blocks. If @head is
|
||||
* non-zero, @head bytes block is inserted before block @i moving it
|
||||
* to @i+1 and reducing its size by @head bytes.
|
||||
*
|
||||
* If @tail is non-zero, the target block, which can be @i or @i+1
|
||||
* depending on @head, is reduced by @tail bytes and @tail byte block
|
||||
* is inserted after the target block.
|
||||
*
|
||||
* RETURNS:
|
||||
* 0 on success, -errno on failure.
|
||||
*/
|
||||
static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
|
||||
{
|
||||
int nr_extra = !!head + !!tail;
|
||||
int target = chunk->map_used + nr_extra;
|
||||
|
||||
/* reallocation required? */
|
||||
if (chunk->map_alloc < target) {
|
||||
int new_alloc = chunk->map_alloc;
|
||||
int *new;
|
||||
|
||||
while (new_alloc < target)
|
||||
new_alloc *= 2;
|
||||
|
||||
new = pcpu_realloc(chunk->map,
|
||||
chunk->map_alloc * sizeof(new[0]),
|
||||
new_alloc * sizeof(new[0]));
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
chunk->map_alloc = new_alloc;
|
||||
chunk->map = new;
|
||||
}
|
||||
|
||||
/* insert a new subblock */
|
||||
memmove(&chunk->map[i + nr_extra], &chunk->map[i],
|
||||
sizeof(chunk->map[0]) * (chunk->map_used - i));
|
||||
chunk->map_used += nr_extra;
|
||||
|
||||
if (head) {
|
||||
chunk->map[i + 1] = chunk->map[i] - head;
|
||||
chunk->map[i++] = head;
|
||||
}
|
||||
if (tail) {
|
||||
chunk->map[i++] -= tail;
|
||||
chunk->map[i] = tail;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_alloc_area - allocate area from a pcpu_chunk
|
||||
* @chunk: chunk of interest
|
||||
* @size: wanted size in bytes
|
||||
* @align: wanted align
|
||||
*
|
||||
* Try to allocate @size bytes area aligned at @align from @chunk.
|
||||
* Note that this function only allocates the offset. It doesn't
|
||||
* populate or map the area.
|
||||
*
|
||||
* RETURNS:
|
||||
* Allocated offset in @chunk on success, -errno on failure.
|
||||
*/
|
||||
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
|
||||
{
|
||||
int oslot = pcpu_chunk_slot(chunk);
|
||||
int max_contig = 0;
|
||||
int i, off;
|
||||
|
||||
/*
|
||||
* The static chunk initially doesn't have map attached
|
||||
* because kmalloc wasn't available during init. Give it one.
|
||||
*/
|
||||
if (unlikely(!chunk->map)) {
|
||||
chunk->map = pcpu_realloc(NULL, 0,
|
||||
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
|
||||
if (!chunk->map)
|
||||
return -ENOMEM;
|
||||
|
||||
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
|
||||
chunk->map[chunk->map_used++] = -pcpu_static_size;
|
||||
if (chunk->free_size)
|
||||
chunk->map[chunk->map_used++] = chunk->free_size;
|
||||
}
|
||||
|
||||
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
|
||||
bool is_last = i + 1 == chunk->map_used;
|
||||
int head, tail;
|
||||
|
||||
/* extra for alignment requirement */
|
||||
head = ALIGN(off, align) - off;
|
||||
BUG_ON(i == 0 && head != 0);
|
||||
|
||||
if (chunk->map[i] < 0)
|
||||
continue;
|
||||
if (chunk->map[i] < head + size) {
|
||||
max_contig = max(chunk->map[i], max_contig);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If head is small or the previous block is free,
|
||||
* merge'em. Note that 'small' is defined as smaller
|
||||
* than sizeof(int), which is very small but isn't too
|
||||
* uncommon for percpu allocations.
|
||||
*/
|
||||
if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
|
||||
if (chunk->map[i - 1] > 0)
|
||||
chunk->map[i - 1] += head;
|
||||
else {
|
||||
chunk->map[i - 1] -= head;
|
||||
chunk->free_size -= head;
|
||||
}
|
||||
chunk->map[i] -= head;
|
||||
off += head;
|
||||
head = 0;
|
||||
}
|
||||
|
||||
/* if tail is small, just keep it around */
|
||||
tail = chunk->map[i] - head - size;
|
||||
if (tail < sizeof(int))
|
||||
tail = 0;
|
||||
|
||||
/* split if warranted */
|
||||
if (head || tail) {
|
||||
if (pcpu_split_block(chunk, i, head, tail))
|
||||
return -ENOMEM;
|
||||
if (head) {
|
||||
i++;
|
||||
off += head;
|
||||
max_contig = max(chunk->map[i - 1], max_contig);
|
||||
}
|
||||
if (tail)
|
||||
max_contig = max(chunk->map[i + 1], max_contig);
|
||||
}
|
||||
|
||||
/* update hint and mark allocated */
|
||||
if (is_last)
|
||||
chunk->contig_hint = max_contig; /* fully scanned */
|
||||
else
|
||||
chunk->contig_hint = max(chunk->contig_hint,
|
||||
max_contig);
|
||||
|
||||
chunk->free_size -= chunk->map[i];
|
||||
chunk->map[i] = -chunk->map[i];
|
||||
|
||||
pcpu_chunk_relocate(chunk, oslot);
|
||||
return off;
|
||||
}
|
||||
|
||||
chunk->contig_hint = max_contig; /* fully scanned */
|
||||
pcpu_chunk_relocate(chunk, oslot);
|
||||
|
||||
/*
|
||||
* Tell the upper layer that this chunk has no area left.
|
||||
* Note that this is not an error condition but a notification
|
||||
* to upper layer that it needs to look at other chunks.
|
||||
* -ENOSPC is chosen as it isn't used in memory subsystem and
|
||||
* matches the meaning in a way.
|
||||
*/
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_free_area - free area to a pcpu_chunk
|
||||
* @chunk: chunk of interest
|
||||
* @freeme: offset of area to free
|
||||
*
|
||||
* Free area starting from @freeme to @chunk. Note that this function
|
||||
* only modifies the allocation map. It doesn't depopulate or unmap
|
||||
* the area.
|
||||
*/
|
||||
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
|
||||
{
|
||||
int oslot = pcpu_chunk_slot(chunk);
|
||||
int i, off;
|
||||
|
||||
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
|
||||
if (off == freeme)
|
||||
break;
|
||||
BUG_ON(off != freeme);
|
||||
BUG_ON(chunk->map[i] > 0);
|
||||
|
||||
chunk->map[i] = -chunk->map[i];
|
||||
chunk->free_size += chunk->map[i];
|
||||
|
||||
/* merge with previous? */
|
||||
if (i > 0 && chunk->map[i - 1] >= 0) {
|
||||
chunk->map[i - 1] += chunk->map[i];
|
||||
chunk->map_used--;
|
||||
memmove(&chunk->map[i], &chunk->map[i + 1],
|
||||
(chunk->map_used - i) * sizeof(chunk->map[0]));
|
||||
i--;
|
||||
}
|
||||
/* merge with next? */
|
||||
if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
|
||||
chunk->map[i] += chunk->map[i + 1];
|
||||
chunk->map_used--;
|
||||
memmove(&chunk->map[i + 1], &chunk->map[i + 2],
|
||||
(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
|
||||
}
|
||||
|
||||
chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
|
||||
pcpu_chunk_relocate(chunk, oslot);
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_unmap - unmap pages out of a pcpu_chunk
|
||||
* @chunk: chunk of interest
|
||||
* @page_start: page index of the first page to unmap
|
||||
* @page_end: page index of the last page to unmap + 1
|
||||
* @flush: whether to flush cache and tlb or not
|
||||
*
|
||||
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
|
||||
* If @flush is true, vcache is flushed before unmapping and tlb
|
||||
* after.
|
||||
*/
|
||||
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
|
||||
bool flush)
|
||||
{
|
||||
unsigned int last = num_possible_cpus() - 1;
|
||||
unsigned int cpu;
|
||||
|
||||
/* unmap must not be done on immutable chunk */
|
||||
WARN_ON(chunk->immutable);
|
||||
|
||||
/*
|
||||
* Each flushing trial can be very expensive, issue flush on
|
||||
* the whole region at once rather than doing it for each cpu.
|
||||
* This could be an overkill but is more scalable.
|
||||
*/
|
||||
if (flush)
|
||||
flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
|
||||
pcpu_chunk_addr(chunk, last, page_end));
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
unmap_kernel_range_noflush(
|
||||
pcpu_chunk_addr(chunk, cpu, page_start),
|
||||
(page_end - page_start) << PAGE_SHIFT);
|
||||
|
||||
/* ditto as flush_cache_vunmap() */
|
||||
if (flush)
|
||||
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
|
||||
pcpu_chunk_addr(chunk, last, page_end));
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
|
||||
* @chunk: chunk to depopulate
|
||||
* @off: offset to the area to depopulate
|
||||
* @size: size of the area to depopulate in bytes
|
||||
* @flush: whether to flush cache and tlb or not
|
||||
*
|
||||
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
|
||||
* from @chunk. If @flush is true, vcache is flushed before unmapping
|
||||
* and tlb after.
|
||||
*/
|
||||
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
|
||||
bool flush)
|
||||
{
|
||||
int page_start = PFN_DOWN(off);
|
||||
int page_end = PFN_UP(off + size);
|
||||
int unmap_start = -1;
|
||||
int uninitialized_var(unmap_end);
|
||||
unsigned int cpu;
|
||||
int i;
|
||||
|
||||
for (i = page_start; i < page_end; i++) {
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
|
||||
|
||||
if (!*pagep)
|
||||
continue;
|
||||
|
||||
__free_page(*pagep);
|
||||
|
||||
/*
|
||||
* If it's partial depopulation, it might get
|
||||
* populated or depopulated again. Mark the
|
||||
* page gone.
|
||||
*/
|
||||
*pagep = NULL;
|
||||
|
||||
unmap_start = unmap_start < 0 ? i : unmap_start;
|
||||
unmap_end = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (unmap_start >= 0)
|
||||
pcpu_unmap(chunk, unmap_start, unmap_end, flush);
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_map - map pages into a pcpu_chunk
|
||||
* @chunk: chunk of interest
|
||||
* @page_start: page index of the first page to map
|
||||
* @page_end: page index of the last page to map + 1
|
||||
*
|
||||
* For each cpu, map pages [@page_start,@page_end) into @chunk.
|
||||
* vcache is flushed afterwards.
|
||||
*/
|
||||
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
|
||||
{
|
||||
unsigned int last = num_possible_cpus() - 1;
|
||||
unsigned int cpu;
|
||||
int err;
|
||||
|
||||
/* map must not be done on immutable chunk */
|
||||
WARN_ON(chunk->immutable);
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
err = map_kernel_range_noflush(
|
||||
pcpu_chunk_addr(chunk, cpu, page_start),
|
||||
(page_end - page_start) << PAGE_SHIFT,
|
||||
PAGE_KERNEL,
|
||||
pcpu_chunk_pagep(chunk, cpu, page_start));
|
||||
if (err < 0)
|
||||
return err;
|
||||
}
|
||||
|
||||
/* flush at once, please read comments in pcpu_unmap() */
|
||||
flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
|
||||
pcpu_chunk_addr(chunk, last, page_end));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_populate_chunk - populate and map an area of a pcpu_chunk
|
||||
* @chunk: chunk of interest
|
||||
* @off: offset to the area to populate
|
||||
* @size: size of the area to populate in bytes
|
||||
*
|
||||
* For each cpu, populate and map pages [@page_start,@page_end) into
|
||||
* @chunk. The area is cleared on return.
|
||||
*/
|
||||
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
|
||||
{
|
||||
const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
|
||||
int page_start = PFN_DOWN(off);
|
||||
int page_end = PFN_UP(off + size);
|
||||
int map_start = -1;
|
||||
int map_end;
|
||||
unsigned int cpu;
|
||||
int i;
|
||||
|
||||
for (i = page_start; i < page_end; i++) {
|
||||
if (pcpu_chunk_page_occupied(chunk, i)) {
|
||||
if (map_start >= 0) {
|
||||
if (pcpu_map(chunk, map_start, map_end))
|
||||
goto err;
|
||||
map_start = -1;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
map_start = map_start < 0 ? i : map_start;
|
||||
map_end = i + 1;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
|
||||
|
||||
*pagep = alloc_pages_node(cpu_to_node(cpu),
|
||||
alloc_mask, 0);
|
||||
if (!*pagep)
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
|
||||
goto err;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
|
||||
size);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
/* likely under heavy memory pressure, give memory back */
|
||||
pcpu_depopulate_chunk(chunk, off, size, true);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
|
||||
{
|
||||
if (!chunk)
|
||||
return;
|
||||
if (chunk->vm)
|
||||
free_vm_area(chunk->vm);
|
||||
pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
|
||||
kfree(chunk);
|
||||
}
|
||||
|
||||
static struct pcpu_chunk *alloc_pcpu_chunk(void)
|
||||
{
|
||||
struct pcpu_chunk *chunk;
|
||||
|
||||
chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
|
||||
if (!chunk)
|
||||
return NULL;
|
||||
|
||||
chunk->map = pcpu_realloc(NULL, 0,
|
||||
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
|
||||
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
|
||||
chunk->map[chunk->map_used++] = pcpu_unit_size;
|
||||
|
||||
chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
|
||||
if (!chunk->vm) {
|
||||
free_pcpu_chunk(chunk);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&chunk->list);
|
||||
chunk->free_size = pcpu_unit_size;
|
||||
chunk->contig_hint = pcpu_unit_size;
|
||||
|
||||
return chunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* __alloc_percpu - allocate percpu area
|
||||
* @size: size of area to allocate in bytes
|
||||
* @align: alignment of area (max PAGE_SIZE)
|
||||
*
|
||||
* Allocate percpu area of @size bytes aligned at @align. Might
|
||||
* sleep. Might trigger writeouts.
|
||||
*
|
||||
* RETURNS:
|
||||
* Percpu pointer to the allocated area on success, NULL on failure.
|
||||
*/
|
||||
void *__alloc_percpu(size_t size, size_t align)
|
||||
{
|
||||
void *ptr = NULL;
|
||||
struct pcpu_chunk *chunk;
|
||||
int slot, off;
|
||||
|
||||
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
|
||||
WARN(true, "illegal size (%zu) or align (%zu) for "
|
||||
"percpu allocation\n", size, align);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
mutex_lock(&pcpu_mutex);
|
||||
|
||||
/* allocate area */
|
||||
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
|
||||
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
|
||||
if (size > chunk->contig_hint)
|
||||
continue;
|
||||
off = pcpu_alloc_area(chunk, size, align);
|
||||
if (off >= 0)
|
||||
goto area_found;
|
||||
if (off != -ENOSPC)
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
/* hmmm... no space left, create a new chunk */
|
||||
chunk = alloc_pcpu_chunk();
|
||||
if (!chunk)
|
||||
goto out_unlock;
|
||||
pcpu_chunk_relocate(chunk, -1);
|
||||
pcpu_chunk_addr_insert(chunk);
|
||||
|
||||
off = pcpu_alloc_area(chunk, size, align);
|
||||
if (off < 0)
|
||||
goto out_unlock;
|
||||
|
||||
area_found:
|
||||
/* populate, map and clear the area */
|
||||
if (pcpu_populate_chunk(chunk, off, size)) {
|
||||
pcpu_free_area(chunk, off);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
|
||||
out_unlock:
|
||||
mutex_unlock(&pcpu_mutex);
|
||||
return ptr;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__alloc_percpu);
|
||||
|
||||
static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
|
||||
{
|
||||
WARN_ON(chunk->immutable);
|
||||
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
|
||||
list_del(&chunk->list);
|
||||
rb_erase(&chunk->rb_node, &pcpu_addr_root);
|
||||
free_pcpu_chunk(chunk);
|
||||
}
|
||||
|
||||
/**
|
||||
* free_percpu - free percpu area
|
||||
* @ptr: pointer to area to free
|
||||
*
|
||||
* Free percpu area @ptr. Might sleep.
|
||||
*/
|
||||
void free_percpu(void *ptr)
|
||||
{
|
||||
void *addr = __pcpu_ptr_to_addr(ptr);
|
||||
struct pcpu_chunk *chunk;
|
||||
int off;
|
||||
|
||||
if (!ptr)
|
||||
return;
|
||||
|
||||
mutex_lock(&pcpu_mutex);
|
||||
|
||||
chunk = pcpu_chunk_addr_search(addr);
|
||||
off = addr - chunk->vm->addr;
|
||||
|
||||
pcpu_free_area(chunk, off);
|
||||
|
||||
/* the chunk became fully free, kill one if there are other free ones */
|
||||
if (chunk->free_size == pcpu_unit_size) {
|
||||
struct pcpu_chunk *pos;
|
||||
|
||||
list_for_each_entry(pos,
|
||||
&pcpu_slot[pcpu_chunk_slot(chunk)], list)
|
||||
if (pos != chunk) {
|
||||
pcpu_kill_chunk(pos);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_unlock(&pcpu_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(free_percpu);
|
||||
|
||||
/**
|
||||
* pcpu_setup_first_chunk - initialize the first percpu chunk
|
||||
* @get_page_fn: callback to fetch page pointer
|
||||
* @static_size: the size of static percpu area in bytes
|
||||
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
|
||||
* @free_size: free size in bytes, 0 for auto
|
||||
* @base_addr: mapped address, NULL for auto
|
||||
* @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
|
||||
*
|
||||
* Initialize the first percpu chunk which contains the kernel static
|
||||
* perpcu area. This function is to be called from arch percpu area
|
||||
* setup path. The first two parameters are mandatory. The rest are
|
||||
* optional.
|
||||
*
|
||||
* @get_page_fn() should return pointer to percpu page given cpu
|
||||
* number and page number. It should at least return enough pages to
|
||||
* cover the static area. The returned pages for static area should
|
||||
* have been initialized with valid data. If @unit_size is specified,
|
||||
* it can also return pages after the static area. NULL return
|
||||
* indicates end of pages for the cpu. Note that @get_page_fn() must
|
||||
* return the same number of pages for all cpus.
|
||||
*
|
||||
* @unit_size, if non-zero, determines unit size and must be aligned
|
||||
* to PAGE_SIZE and equal to or larger than @static_size + @free_size.
|
||||
*
|
||||
* @free_size determines the number of free bytes after the static
|
||||
* area in the first chunk. If zero, whatever left is available.
|
||||
* Specifying non-zero value make percpu leave the area after
|
||||
* @static_size + @free_size alone.
|
||||
*
|
||||
* Non-null @base_addr means that the caller already allocated virtual
|
||||
* region for the first chunk and mapped it. percpu must not mess
|
||||
* with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
|
||||
* @populate_pte_fn doesn't make any sense.
|
||||
*
|
||||
* @populate_pte_fn is used to populate the pagetable. NULL means the
|
||||
* caller already populated the pagetable.
|
||||
*
|
||||
* RETURNS:
|
||||
* The determined pcpu_unit_size which can be used to initialize
|
||||
* percpu access.
|
||||
*/
|
||||
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
|
||||
size_t static_size, size_t unit_size,
|
||||
size_t free_size, void *base_addr,
|
||||
pcpu_populate_pte_fn_t populate_pte_fn)
|
||||
{
|
||||
static struct vm_struct static_vm;
|
||||
struct pcpu_chunk *static_chunk;
|
||||
unsigned int cpu;
|
||||
int nr_pages;
|
||||
int err, i;
|
||||
|
||||
/* santiy checks */
|
||||
BUG_ON(!static_size);
|
||||
BUG_ON(!unit_size && free_size);
|
||||
BUG_ON(unit_size && unit_size < static_size + free_size);
|
||||
BUG_ON(unit_size & ~PAGE_MASK);
|
||||
BUG_ON(base_addr && !unit_size);
|
||||
BUG_ON(base_addr && populate_pte_fn);
|
||||
|
||||
if (unit_size)
|
||||
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
|
||||
else
|
||||
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
|
||||
PFN_UP(static_size));
|
||||
|
||||
pcpu_static_size = static_size;
|
||||
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
|
||||
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
|
||||
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
|
||||
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
|
||||
|
||||
/*
|
||||
* Allocate chunk slots. The additional last slot is for
|
||||
* empty chunks.
|
||||
*/
|
||||
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
|
||||
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
|
||||
for (i = 0; i < pcpu_nr_slots; i++)
|
||||
INIT_LIST_HEAD(&pcpu_slot[i]);
|
||||
|
||||
/* init static_chunk */
|
||||
static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
|
||||
INIT_LIST_HEAD(&static_chunk->list);
|
||||
static_chunk->vm = &static_vm;
|
||||
|
||||
if (free_size)
|
||||
static_chunk->free_size = free_size;
|
||||
else
|
||||
static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
|
||||
|
||||
static_chunk->contig_hint = static_chunk->free_size;
|
||||
|
||||
/* allocate vm address */
|
||||
static_vm.flags = VM_ALLOC;
|
||||
static_vm.size = pcpu_chunk_size;
|
||||
|
||||
if (!base_addr)
|
||||
vm_area_register_early(&static_vm, PAGE_SIZE);
|
||||
else {
|
||||
/*
|
||||
* Pages already mapped. No need to remap into
|
||||
* vmalloc area. In this case the static chunk can't
|
||||
* be mapped or unmapped by percpu and is marked
|
||||
* immutable.
|
||||
*/
|
||||
static_vm.addr = base_addr;
|
||||
static_chunk->immutable = true;
|
||||
}
|
||||
|
||||
/* assign pages */
|
||||
nr_pages = -1;
|
||||
for_each_possible_cpu(cpu) {
|
||||
for (i = 0; i < pcpu_unit_pages; i++) {
|
||||
struct page *page = get_page_fn(cpu, i);
|
||||
|
||||
if (!page)
|
||||
break;
|
||||
*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
|
||||
}
|
||||
|
||||
BUG_ON(i < PFN_UP(pcpu_static_size));
|
||||
|
||||
if (nr_pages < 0)
|
||||
nr_pages = i;
|
||||
else
|
||||
BUG_ON(nr_pages != i);
|
||||
}
|
||||
|
||||
/* map them */
|
||||
if (populate_pte_fn) {
|
||||
for_each_possible_cpu(cpu)
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
populate_pte_fn(pcpu_chunk_addr(static_chunk,
|
||||
cpu, i));
|
||||
|
||||
err = pcpu_map(static_chunk, 0, nr_pages);
|
||||
if (err)
|
||||
panic("failed to setup static percpu area, err=%d\n",
|
||||
err);
|
||||
}
|
||||
|
||||
/* link static_chunk in */
|
||||
pcpu_chunk_relocate(static_chunk, -1);
|
||||
pcpu_chunk_addr_insert(static_chunk);
|
||||
|
||||
/* we're done */
|
||||
pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
|
||||
return pcpu_unit_size;
|
||||
}
|
94
mm/vmalloc.c
94
mm/vmalloc.c
|
@ -24,6 +24,7 @@
|
|||
#include <linux/radix-tree.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/pfn.h>
|
||||
|
||||
#include <asm/atomic.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
|
|||
*
|
||||
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
|
||||
*/
|
||||
static int vmap_page_range(unsigned long start, unsigned long end,
|
||||
pgprot_t prot, struct page **pages)
|
||||
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
|
||||
pgprot_t prot, struct page **pages)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
unsigned long next;
|
||||
|
@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
|
|||
if (err)
|
||||
break;
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
flush_cache_vmap(start, end);
|
||||
|
||||
if (unlikely(err))
|
||||
return err;
|
||||
return nr;
|
||||
}
|
||||
|
||||
static int vmap_page_range(unsigned long start, unsigned long end,
|
||||
pgprot_t prot, struct page **pages)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = vmap_page_range_noflush(start, end, prot, pages);
|
||||
flush_cache_vmap(start, end);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int is_vmalloc_or_module_addr(const void *x)
|
||||
{
|
||||
/*
|
||||
|
@ -982,6 +992,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
|
|||
}
|
||||
EXPORT_SYMBOL(vm_map_ram);
|
||||
|
||||
/**
|
||||
* vm_area_register_early - register vmap area early during boot
|
||||
* @vm: vm_struct to register
|
||||
* @align: requested alignment
|
||||
*
|
||||
* This function is used to register kernel vm area before
|
||||
* vmalloc_init() is called. @vm->size and @vm->flags should contain
|
||||
* proper values on entry and other fields should be zero. On return,
|
||||
* vm->addr contains the allocated address.
|
||||
*
|
||||
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
|
||||
*/
|
||||
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
|
||||
{
|
||||
static size_t vm_init_off __initdata;
|
||||
unsigned long addr;
|
||||
|
||||
addr = ALIGN(VMALLOC_START + vm_init_off, align);
|
||||
vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
|
||||
|
||||
vm->addr = (void *)addr;
|
||||
|
||||
vm->next = vmlist;
|
||||
vmlist = vm;
|
||||
}
|
||||
|
||||
void __init vmalloc_init(void)
|
||||
{
|
||||
struct vmap_area *va;
|
||||
|
@ -1009,6 +1045,58 @@ void __init vmalloc_init(void)
|
|||
vmap_initialized = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* map_kernel_range_noflush - map kernel VM area with the specified pages
|
||||
* @addr: start of the VM area to map
|
||||
* @size: size of the VM area to map
|
||||
* @prot: page protection flags to use
|
||||
* @pages: pages to map
|
||||
*
|
||||
* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
|
||||
* specify should have been allocated using get_vm_area() and its
|
||||
* friends.
|
||||
*
|
||||
* NOTE:
|
||||
* This function does NOT do any cache flushing. The caller is
|
||||
* responsible for calling flush_cache_vmap() on to-be-mapped areas
|
||||
* before calling this function.
|
||||
*
|
||||
* RETURNS:
|
||||
* The number of pages mapped on success, -errno on failure.
|
||||
*/
|
||||
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
|
||||
pgprot_t prot, struct page **pages)
|
||||
{
|
||||
return vmap_page_range_noflush(addr, addr + size, prot, pages);
|
||||
}
|
||||
|
||||
/**
 * unmap_kernel_range_noflush - unmap kernel VM area
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
 * specify should have been allocated using get_vm_area() and its
 * friends.
 *
 * NOTE:
 * This function does NOT do any cache flushing.  The caller is
 * responsible for calling flush_cache_vunmap() on to-be-unmapped
 * areas before calling this function and flush_tlb_kernel_range()
 * after.
 */
void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
	unsigned long end = addr + size;

	vunmap_page_range(addr, end);
}
|
||||
|
||||
/**
|
||||
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
|
||||
* @addr: start of the VM area to unmap
|
||||
* @size: size of the VM area to unmap
|
||||
*
|
||||
* Similar to unmap_kernel_range_noflush() but flushes vcache before
|
||||
* the unmapping and tlb after.
|
||||
*/
|
||||
void unmap_kernel_range(unsigned long addr, unsigned long size)
|
||||
{
|
||||
unsigned long end = addr + size;
|
||||
|
|
|
@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
|
|||
int snmp_mib_init(void *ptr[2], size_t mibsize)
|
||||
{
|
||||
BUG_ON(ptr == NULL);
|
||||
ptr[0] = __alloc_percpu(mibsize);
|
||||
ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
|
||||
if (!ptr[0])
|
||||
goto err0;
|
||||
ptr[1] = __alloc_percpu(mibsize);
|
||||
ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
|
||||
if (!ptr[1])
|
||||
goto err1;
|
||||
return 0;
|
||||
|
|
Загрузка…
Ссылка в новой задаче