diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index d359bcfadd39..fdfd2b684822 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -595,16 +595,23 @@ Documentation/admin-guide/kernel-parameters.rst). numa_balancing ============== -Enables/disables automatic page fault based NUMA memory -balancing. Memory is moved automatically to nodes -that access it often. +Enables/disables and configures automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes that access it often. +The value to set can be the result of ORing the following: -Enables/disables automatic NUMA memory balancing. On NUMA machines, there -is a performance penalty if remote memory is accessed by a CPU. When this -feature is enabled the kernel samples what task thread is accessing memory -by periodically unmapping pages and later trapping a page fault. At the -time of the page fault, it is determined if the data being accessed should -be migrated to a local memory node. += ================================= +0 NUMA_BALANCING_DISABLED +1 NUMA_BALANCING_NORMAL +2 NUMA_BALANCING_MEMORY_TIERING += ================================= + +Or NUMA_BALANCING_NORMAL to optimize page placement among different +NUMA nodes to reduce remote accessing. On NUMA machines, there is a +performance penalty if remote memory is accessed by a CPU. When this +feature is enabled the kernel samples what task thread is accessing +memory by periodically unmapping pages and later trapping a page +fault. At the time of the page fault, it is determined if the data +being accessed should be migrated to a local memory node. The unmapping of pages and trapping faults incur additional overhead that ideally is offset by improved memory locality but there is no universal @@ -615,6 +622,10 @@ faults may be controlled by the `numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls. +Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among +different types of memory (represented as different NUMA nodes) to +place the hot pages in the fast memory. This is implemented based on +unmapping and page fault too. numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb =============================================================================================================================== diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 310b6e7ce58a..962b14d403e8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -353,6 +353,7 @@ enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, + WMARK_PROMO, NR_WMARK }; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c19dd5a2c05c..b5eec8854c5a 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -23,6 +23,16 @@ enum sched_tunable_scaling { SCHED_TUNABLESCALING_END, }; +#define NUMA_BALANCING_DISABLED 0x0 +#define NUMA_BALANCING_NORMAL 0x1 +#define NUMA_BALANCING_MEMORY_TIERING 0x2 + +#ifdef CONFIG_NUMA_BALANCING +extern int sysctl_numa_balancing_mode; +#else +#define sysctl_numa_balancing_mode 0 +#endif + /* * control realtime throttling: * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9745613d531c..da6a60383645 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4279,7 +4279,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4287,13 +4289,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4303,8 +4314,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 730ab56d9e92..3395b99d59a4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1696,7 +1696,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ { diff --git a/mm/migrate.c b/mm/migrate.c index dc4adf979201..78b2cf87946d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -2031,16 +2032,27 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; int nr_pages = thp_nr_pages(page); + int order = compound_order(page); - VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, nr_pages)) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) { + int z; + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) + return 0; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + if (populated_zone(pgdat->node_zones + z)) + break; + } + wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; + } if (isolate_lru_page(page)) return 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a573aa9f5160..8b18a077c409 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8441,7 +8441,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; - zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp; + zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp; spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/vmscan.c b/mm/vmscan.c index f5ec53f19f3b..499fa86e754a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include #include +#include #include "internal.h" @@ -3895,7 +3896,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) if (!managed_zone(zone)) continue; - mark = high_wmark_pages(zone); + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + mark = wmark_pages(zone, WMARK_PROMO); + else + mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; }