diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8a99fa6d096c..f9d320828067 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,73 @@ module_param(memmap_on_memory, bool, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); #endif +enum { + ONLINE_POLICY_CONTIG_ZONES = 0, + ONLINE_POLICY_AUTO_MOVABLE, +}; + +const char *online_policy_to_str[] = { + [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", + [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", +}; + +static int set_online_policy(const char *val, const struct kernel_param *kp) +{ + int ret = sysfs_match_string(online_policy_to_str, val); + + if (ret < 0) + return ret; + *((int *)kp->arg) = ret; + return 0; +} + +static int get_online_policy(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); +} + +/* + * memory_hotplug.online_policy: configure online behavior when onlining without + * specifying a zone (MMOP_ONLINE) + * + * "contig-zones": keep zone contiguous + * "auto-movable": online memory to ZONE_MOVABLE if the configuration + * (auto_movable_ratio, auto_movable_numa_aware) allows for it + */ +static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; +static const struct kernel_param_ops online_policy_ops = { + .set = set_online_policy, + .get = get_online_policy, +}; +module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); +MODULE_PARM_DESC(online_policy, + "Set the online policy (\"contig-zones\", \"auto-movable\") " + "Default: \"contig-zones\""); + +/* + * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio + * + * The ratio represent an upper limit and the kernel might decide to not + * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory + * doesn't allow for more MOVABLE memory. + */ +static unsigned int auto_movable_ratio __read_mostly = 301; +module_param(auto_movable_ratio, uint, 0644); +MODULE_PARM_DESC(auto_movable_ratio, + "Set the maximum ratio of MOVABLE:KERNEL memory in the system " + "in percent for \"auto-movable\" online policy. Default: 301"); + +/* + * memory_hotplug.auto_movable_numa_aware: consider numa node stats + */ +#ifdef CONFIG_NUMA +static bool auto_movable_numa_aware __read_mostly = true; +module_param(auto_movable_numa_aware, bool, 0644); +MODULE_PARM_DESC(auto_movable_numa_aware, + "Consider numa node stats in addition to global stats in " + "\"auto-movable\" online policy. Default: true"); +#endif /* CONFIG_NUMA */ + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -663,6 +730,61 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, set_zone_contiguous(zone); } +struct auto_movable_stats { + unsigned long kernel_early_pages; + unsigned long movable_pages; +}; + +static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, + struct zone *zone) +{ + if (zone_idx(zone) == ZONE_MOVABLE) { + stats->movable_pages += zone->present_pages; + } else { + stats->kernel_early_pages += zone->present_early_pages; +#ifdef CONFIG_CMA + /* + * CMA pages (never on hotplugged memory) behave like + * ZONE_MOVABLE. + */ + stats->movable_pages += zone->cma_pages; + stats->kernel_early_pages -= zone->cma_pages; +#endif /* CONFIG_CMA */ + } +} + +static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages) +{ + struct auto_movable_stats stats = {}; + unsigned long kernel_early_pages, movable_pages; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + int i; + + /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ + if (nid == NUMA_NO_NODE) { + /* TODO: cache values */ + for_each_populated_zone(zone) + auto_movable_stats_account_zone(&stats, zone); + } else { + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (populated_zone(zone)) + auto_movable_stats_account_zone(&stats, zone); + } + } + + kernel_early_pages = stats.kernel_early_pages; + movable_pages = stats.movable_pages; + + /* + * Test if we could online the given number of pages to ZONE_MOVABLE + * and still stay in the configured ratio. + */ + movable_pages += nr_pages; + return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; +} + /* * Returns a default kernel memory zone for the given pfn range. * If no kernel zone covers this pfn range it will automatically go @@ -684,6 +806,72 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn return &pgdat->node_zones[ZONE_NORMAL]; } +/* + * Determine to which zone to online memory dynamically based on user + * configuration and system stats. We care about the following ratio: + * + * MOVABLE : KERNEL + * + * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in + * one of the kernel zones. CMA pages inside one of the kernel zones really + * behaves like ZONE_MOVABLE, so we treat them accordingly. + * + * We don't allow for hotplugged memory in a KERNEL zone to increase the + * amount of MOVABLE memory we can have, so we end up with: + * + * MOVABLE : KERNEL_EARLY + * + * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze + * boot. We base our calculation on KERNEL_EARLY internally, because: + * + * a) Hotplugged memory in one of the kernel zones can sometimes still get + * hotunplugged, especially when hot(un)plugging individual memory blocks. + * There is no coordination across memory devices, therefore "automatic" + * hotunplugging, as implemented in hypervisors, could result in zone + * imbalances. + * b) Early/boot memory in one of the kernel zones can usually not get + * hotunplugged again (e.g., no firmware interface to unplug, fragmented + * with unmovable allocations). While there are corner cases where it might + * still work, it is barely relevant in practice. + * + * We rely on "present pages" instead of "managed pages", as the latter is + * highly unreliable and dynamic in virtualized environments, and does not + * consider boot time allocations. For example, memory ballooning adjusts the + * managed pages when inflating/deflating the balloon, and balloon compaction + * can even migrate inflated pages between zones. + * + * Using "present pages" is better but some things to keep in mind are: + * + * a) Some memblock allocations, such as for the crashkernel area, are + * effectively unused by the kernel, yet they account to "present pages". + * Fortunately, these allocations are comparatively small in relevant setups + * (e.g., fraction of system memory). + * b) Some hotplugged memory blocks in virtualized environments, esecially + * hotplugged by virtio-mem, look like they are completely present, however, + * only parts of the memory block are actually currently usable. + * "present pages" is an upper limit that can get reached at runtime. As + * we base our calculations on KERNEL_EARLY, this is not an issue. + */ +static struct zone *auto_movable_zone_for_pfn(int nid, unsigned long pfn, + unsigned long nr_pages) +{ + if (!auto_movable_ratio) + goto kernel_zone; + + if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages)) + goto kernel_zone; + +#ifdef CONFIG_NUMA + if (auto_movable_numa_aware && + !auto_movable_can_online_movable(nid, nr_pages)) + goto kernel_zone; +#endif /* CONFIG_NUMA */ + + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; +kernel_zone: + return default_kernel_zone_for_pfn(nid, pfn, nr_pages); +} + static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { @@ -717,6 +905,9 @@ struct zone *zone_for_pfn_range(int online_type, int nid, if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) + return auto_movable_zone_for_pfn(nid, start_pfn, nr_pages); + return default_zone_for_pfn(nid, start_pfn, nr_pages); }