diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index f3fd528ead3b..54136fd00202 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -44,150 +44,7 @@ struct early_node_data {
 };
 
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-
-/**
- * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
- *
- * This function will move nodes with only CPUs (no memory)
- * to a node with memory which is at the minimum numa_slit distance.
- * Any reassigments will result in the compression of the nodes
- * and renumbering the nid values where appropriate.
- * The static declarations below are to avoid large stack size which
- * makes the code not re-entrant.
- */
-static void __init reassign_cpu_only_nodes(void)
-{
-	struct node_memblk_s *p;
-	int i, j, k, nnode, nid, cpu, cpunid, pxm;
-	u8 cslit, slit;
-	static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
-	static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
-	static int node_flip[MAX_NUMNODES] __initdata;
-	static int old_nid_map[NR_CPUS] __initdata;
-
-	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-		if (!test_bit(p->nid, (void *) nodes_with_mem)) {
-			set_bit(p->nid, (void *) nodes_with_mem);
-			nnode++;
-		}
-
-	/*
-	 * All nids with memory.
-	 */
-	if (nnode == num_online_nodes())
-		return;
-
-	/*
-	 * Change nids and attempt to migrate CPU-only nodes
-	 * to the best numa_slit (closest neighbor) possible.
-	 * For reassigned CPU nodes a nid can't be arrived at
-	 * until after this loop because the target nid's new
-	 * identity might not have been established yet. So
-	 * new nid values are fabricated above num_online_nodes() and
-	 * mapped back later to their true value.
-	 */
-	/* MCD - This code is a bit complicated, but may be unnecessary now.
-	 * We can now handle much more interesting node-numbering.
-	 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
-	 * and that there be no holes in the numbering 0..numnodes
-	 * has become simply 0 <= nid <= MAX_NUMNODES.
-	 */
-	nid = 0;
-	for_each_online_node(i) {
-		if (test_bit(i, (void *) nodes_with_mem)) {
-			/*
-			 * Save original nid value for numa_slit
-			 * fixup and node_cpuid reassignments.
-			 */
-			node_flip[nid] = i;
-
-			if (i == nid) {
-				nid++;
-				continue;
-			}
-
-			for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-				if (p->nid == i)
-					p->nid = nid;
-
-			cpunid = nid;
-			nid++;
-		} else
-			cpunid = MAX_NUMNODES;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (node_cpuid[cpu].nid == i) {
-				/*
-				 * For nodes not being reassigned just
-				 * fix the cpu's nid and reverse pxm map
-				 */
-				if (cpunid < MAX_NUMNODES) {
-					pxm = nid_to_pxm_map[i];
-					pxm_to_nid_map[pxm] =
-						node_cpuid[cpu].nid = cpunid;
-					continue;
-				}
-
-				/*
-				 * For nodes being reassigned, find best node by
-				 * numa_slit information and then make a temporary
-				 * nid value based on current nid and num_online_nodes().
-				 */
-				slit = 0xff;
-				k = 2*num_online_nodes();
-				for_each_online_node(j) {
-					if (i == j)
-						continue;
-					else if (test_bit(j, (void *) nodes_with_mem)) {
-						cslit = numa_slit[i * num_online_nodes() + j];
-						if (cslit < slit) {
-							k = num_online_nodes() + j;
-							slit = cslit;
-						}
-					}
-				}
-
-				/* save old nid map so we can update the pxm */
-				old_nid_map[cpu] = node_cpuid[cpu].nid;
-				node_cpuid[cpu].nid = k;
-			}
-	}
-
-	/*
-	 * Fixup temporary nid values for CPU-only nodes.
-	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
-			pxm = nid_to_pxm_map[old_nid_map[cpu]];
-			pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
-		} else {
-			for (i = 0; i < nnode; i++) {
-				if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
-					continue;
-
-				pxm = nid_to_pxm_map[old_nid_map[cpu]];
-				pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
-				break;
-			}
-		}
-
-	/*
-	 * Fix numa_slit by compressing from larger
-	 * nid array to reduced nid array.
-	 */
-	for (i = 0; i < nnode; i++)
-		for (j = 0; j < nnode; j++)
-			numa_slit_fix[i * nnode + j] =
-				numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
-
-	memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
-
-	nodes_clear(node_online_map);
-	for (i = 0; i < nnode; i++)
-		node_set_online(i);
-
-	return;
-}
+static nodemask_t memory_less_mask __initdata;
 
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
@@ -232,28 +89,6 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
 	return 0;
 }
 
-/**
- * early_nr_phys_cpus_node - return number of physical cpus on a given node
- * @node: node to check
- *
- * Count the number of physical cpus on @node.  These are cpus that actually
- * exist.  We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.
- */
-static int early_nr_phys_cpus_node(int node)
-{
-	int cpu, n = 0;
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node == node_cpuid[cpu].nid)
-			if ((cpu == 0) || node_cpuid[cpu].phys_id)
-				n++;
-
-	return n;
-}
-
-
 /**
  * early_nr_cpus_node - return number of cpus on a given node
  * @node: node to check
 *
@@ -262,7 +97,7 @@ static int early_nr_phys_cpus_node(int node)
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
  * called yet.  Note that node 0 will also count all non-existent cpus.
  */
-static int early_nr_cpus_node(int node)
+static int __init early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
@@ -273,6 +108,70 @@ static int early_nr_cpus_node(int node)
 	return n;
 }
 
+/**
+ * compute_pernodesize - compute size of pernode data
+ * @node: the node id.
+ */
+static unsigned long __init compute_pernodesize(int node)
+{
+	unsigned long pernodesize = 0, cpus;
+
+	cpus = early_nr_cpus_node(node);
+	pernodesize += PERCPU_PAGE_SIZE * cpus;
+	pernodesize += node * L1_CACHE_BYTES;
+	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+	pernodesize = PAGE_ALIGN(pernodesize);
+	return pernodesize;
+}
+
+/**
+ * fill_pernode - initialize pernode data.
+ * @node: the node id.
+ * @pernode: physical address of pernode data
+ * @pernodesize: size of the pernode data
+ */
+static void __init fill_pernode(int node, unsigned long pernode,
+	unsigned long pernodesize)
+{
+	void *cpu_data;
+	int cpus = early_nr_cpus_node(node), cpu;
+	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+
+	mem_data[node].pernode_addr = pernode;
+	mem_data[node].pernode_size = pernodesize;
+	memset(__va(pernode), 0, pernodesize);
+
+	cpu_data = (void *)pernode;
+	pernode += PERCPU_PAGE_SIZE * cpus;
+	pernode += node * L1_CACHE_BYTES;
+
+	mem_data[node].pgdat = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	mem_data[node].node_data = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+	mem_data[node].pgdat->bdata = bdp;
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	/*
+	 * Copy the static per-cpu data into the region we
+	 * just set aside and then setup __per_cpu_offset
+	 * for each CPU on this node.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node == node_cpuid[cpu].nid) {
+			memcpy(__va(cpu_data), __phys_per_cpu_start,
+				__per_cpu_end - __per_cpu_start);
+			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+				__per_cpu_start;
+			cpu_data += PERCPU_PAGE_SIZE;
+		}
+	}
+
+	return;
+}
 /**
  * find_pernode_space - allocate memory for memory map and per-node structures
  * @start: physical start of range
@@ -304,9 +203,8 @@ static int early_nr_cpus_node(int node)
 static int __init find_pernode_space(unsigned long start, unsigned long len,
 				     int node)
 {
-	unsigned long epfn, cpu, cpus, phys_cpus;
+	unsigned long epfn;
 	unsigned long pernodesize = 0, pernode, pages, mapsize;
-	void *cpu_data;
 	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
 	epfn = (start + len) >> PAGE_SHIFT;
@@ -329,49 +227,12 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
 	 * Calculate total size needed, incl. what's necessary
 	 * for good alignment and alias prevention.
 	 */
-	cpus = early_nr_cpus_node(node);
-	phys_cpus = early_nr_phys_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize = PAGE_ALIGN(pernodesize);
+	pernodesize = compute_pernodesize(node);
 	pernode = NODEDATA_ALIGN(start, node);
 
 	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize + mapsize)) {
-		mem_data[node].pernode_addr = pernode;
-		mem_data[node].pernode_size = pernodesize;
-		memset(__va(pernode), 0, pernodesize);
-
-		cpu_data = (void *)pernode;
-		pernode += PERCPU_PAGE_SIZE * cpus;
-		pernode += node * L1_CACHE_BYTES;
-
-		mem_data[node].pgdat = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		mem_data[node].node_data = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-
-		mem_data[node].pgdat->bdata = bdp;
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		/*
-		 * Copy the static per-cpu data into the region we
-		 * just set aside and then setup __per_cpu_offset
-		 * for each CPU on this node.
-		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (node == node_cpuid[cpu].nid) {
-				memcpy(__va(cpu_data), __phys_per_cpu_start,
-					__per_cpu_end - __per_cpu_start);
-				__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
-					__per_cpu_start;
-				cpu_data += PERCPU_PAGE_SIZE;
-			}
-		}
-	}
+	if (start + len > (pernode + pernodesize + mapsize))
+		fill_pernode(node, pernode, pernodesize);
 
 	return 0;
 }
@@ -411,6 +272,9 @@ static void __init reserve_pernode_space(void)
 	for_each_online_node(node) {
 		pg_data_t *pdp = mem_data[node].pgdat;
 
+		if (node_isset(node, memory_less_mask))
+			continue;
+
 		bdp = pdp->bdata;
 
 		/* First the bootmem_map itself */
@@ -455,6 +319,83 @@ static void __init initialize_pernode_data(void)
 	}
 }
 
+/**
+ * memory_less_node_alloc - attempt to allocate memory on the best NUMA slit
+ * node but fall back to any other node when __alloc_bootmem_node fails
+ * for best.
+ * @nid: node id
+ * @pernodesize: size of this node's pernode data
+ * @align: alignment to use for this node's pernode data
+ */
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
+	unsigned long align)
+{
+	void *ptr = NULL;
+	u8 best = 0xff;
+	int bestnode = -1, node;
+
+	for_each_online_node(node) {
+		if (node_isset(node, memory_less_mask))
+			continue;
+		else if (node_distance(nid, node) < best) {
+			best = node_distance(nid, node);
+			bestnode = node;
+		}
+	}
+
+	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
+		pernodesize, align, __pa(MAX_DMA_ADDRESS));
+
+	if (!ptr)
+		panic("NO memory for memory less node\n");
+	return ptr;
+}
+
+/**
+ * pgdat_insert - insert the pgdat into global pgdat_list
+ * @pgdat: the pgdat for a node.
+ */
+static void __init pgdat_insert(pg_data_t *pgdat)
+{
+	pg_data_t *prev = NULL, *next;
+
+	for_each_pgdat(next)
+		if (pgdat->node_id < next->node_id)
+			break;
+		else
+			prev = next;
+
+	if (prev) {
+		prev->pgdat_next = pgdat;
+		pgdat->pgdat_next = next;
+	} else {
+		pgdat->pgdat_next = pgdat_list;
+		pgdat_list = pgdat;
+	}
+
+	return;
+}
+
+/**
+ * memory_less_nodes - allocate and initialize CPU-only nodes pernode
+ *	information.
+ */
+static void __init memory_less_nodes(void)
+{
+	unsigned long pernodesize;
+	void *pernode;
+	int node;
+
+	for_each_node_mask(node, memory_less_mask) {
+		pernodesize = compute_pernodesize(node);
+		pernode = memory_less_node_alloc(node, pernodesize,
+			(node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
+		fill_pernode(node, __pa(pernode), pernodesize);
+	}
+
+	return;
+}
+
 /**
  * find_memory - walk the EFI memory map and setup the bootmem allocator
  *
@@ -472,16 +413,19 @@ void __init find_memory(void)
 		node_set_online(0);
 	}
 
+	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
 	min_low_pfn = -1;
 	max_low_pfn = 0;
 
-	if (num_online_nodes() > 1)
-		reassign_cpu_only_nodes();
-
 	/* These actually end up getting called by call_pernode_memory() */
 	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
 	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 
+	for_each_online_node(node)
+		if (mem_data[node].bootmem_data.node_low_pfn) {
+			node_clear(node, memory_less_mask);
+			mem_data[node].min_pfn = ~0UL;
+		}
 	/*
 	 * Initialize the boot memory maps in reverse order since that's
 	 * what the bootmem allocator expects
@@ -492,17 +436,14 @@ void __init find_memory(void)
 		if (!node_online(node))
 			continue;
+		else if (node_isset(node, memory_less_mask))
+			continue;
 
 		bdp = &mem_data[node].bootmem_data;
 		pernode = mem_data[node].pernode_addr;
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
 
-		/* Sanity check... */
-		if (!pernode)
-			panic("pernode space for node %d "
-			      "could not be allocated!", node);
-
 		init_bootmem_node(mem_data[node].pgdat,
 				  map>>PAGE_SHIFT,
 				  bdp->node_boot_start>>PAGE_SHIFT,
 				  bdp->node_low_pfn);
@@ -512,6 +453,7 @@ void __init find_memory(void)
 	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
 
 	reserve_pernode_space();
+	memory_less_nodes();
 	initialize_pernode_data();
 
 	max_pfn = max_low_pfn;
@@ -680,12 +622,13 @@ void __init paging_init(void)
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-	/* so min() will work in count_node_pages */
-	for_each_online_node(node)
-		mem_data[node].min_pfn = ~0UL;
-
 	efi_memmap_walk(filter_rsvd_memory, count_node_pages);
 
+	vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+	vmem_map = (struct page *) vmalloc_end;
+	efi_memmap_walk(create_mem_map_page_table, NULL);
+	printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+
 	for_each_online_node(node) {
 		memset(zones_size, 0, sizeof(zones_size));
 		memset(zholes_size, 0, sizeof(zholes_size));
@@ -719,15 +662,6 @@ void __init paging_init(void)
 				mem_data[node].num_dma_physpages);
 		}
 
-		if (node == 0) {
-			vmalloc_end -=
-				PAGE_ALIGN(max_low_pfn * sizeof(struct page));
-			vmem_map = (struct page *) vmalloc_end;
-
-			efi_memmap_walk(create_mem_map_page_table, NULL);
-			printk("Virtual mem_map starts at 0x%p\n", vmem_map);
-		}
-
 		pfn_offset = mem_data[node].min_pfn;
 
 		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
@@ -735,5 +669,11 @@ void __init paging_init(void)
 			    pfn_offset, zholes_size);
 	}
 
+	/*
+	 * Make memory less nodes become a member of the known nodes.
+	 */
+	for_each_node_mask(node, memory_less_mask)
+		pgdat_insert(mem_data[node].pgdat);
+
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 4eb2f52b87a1..65f9958db9f0 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -597,7 +597,8 @@ mem_init (void)
 	kclist_add(&kcore_kernel, _stext, _end - _stext);
 
 	for_each_pgdat(pgdat)
-		totalram_pages += free_all_bootmem_node(pgdat);
+		if (pgdat->bdata->node_bootmem_map)
+			totalram_pages += free_all_bootmem_node(pgdat);
 
 	reserved_pages = 0;
 	efi_memmap_walk(count_reserved_pages, &reserved_pages);
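
The core of the new memory_less_node_alloc() above is a nearest-neighbour scan over node_distance(): a CPU-only node borrows its pernode area from the closest node that actually owns memory, and the patch panics only if even that allocation fails. The stand-alone sketch below mirrors just that scan in ordinary user-space C, under assumed example data; EXAMPLE_NODES, example_distance[], example_memoryless[] and nearest_memory_node() are made-up names for illustration and are not part of the patch or of any kernel API.

#include <stdio.h>

#define EXAMPLE_NODES 4

/*
 * Hypothetical SLIT-style distance table for four nodes: lower means
 * closer.  Purely illustrative data, not taken from the patch.
 */
static unsigned char example_distance[EXAMPLE_NODES][EXAMPLE_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* 1 marks a CPU-only node, i.e. one that would sit in memory_less_mask. */
static int example_memoryless[EXAMPLE_NODES] = { 0, 1, 0, 1 };

/*
 * Same decision memory_less_node_alloc() makes: skip nodes without
 * memory and remember the one with the smallest distance from nid.
 */
static int nearest_memory_node(int nid)
{
	unsigned char best = 0xff;
	int bestnode = -1, node;

	for (node = 0; node < EXAMPLE_NODES; node++) {
		if (example_memoryless[node])
			continue;
		if (example_distance[nid][node] < best) {
			best = example_distance[nid][node];
			bestnode = node;
		}
	}
	return bestnode;
}

int main(void)
{
	int node;

	/* Report the donor node each memory-less node would borrow from. */
	for (node = 0; node < EXAMPLE_NODES; node++)
		if (example_memoryless[node])
			printf("node %d is memory-less, borrows pernode space from node %d\n",
			       node, nearest_memory_node(node));
	return 0;
}

Compiled with any C compiler, the sketch prints which memory node each memory-less node would borrow from, which is the same choice memory_less_nodes() feeds into fill_pernode() via __alloc_bootmem_node() on the chosen node's pgdat.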