-rw-r--r-- | arch/s390/include/asm/hiperdispatch.h |  14
-rw-r--r-- | arch/s390/kernel/Makefile              |   2
-rw-r--r-- | arch/s390/kernel/hiperdispatch.c       | 199
-rw-r--r-- | arch/s390/kernel/topology.c            |  18
4 files changed, 228 insertions, 5 deletions
diff --git a/arch/s390/include/asm/hiperdispatch.h b/arch/s390/include/asm/hiperdispatch.h
new file mode 100644
index 000000000000..27e23aa27a24
--- /dev/null
+++ b/arch/s390/include/asm/hiperdispatch.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#ifndef _ASM_HIPERDISPATCH_H
+#define _ASM_HIPERDISPATCH_H
+
+void hd_reset_state(void);
+void hd_add_core(int cpu);
+void hd_disable_hiperdispatch(void);
+int hd_enable_hiperdispatch(void);
+
+#endif /* _ASM_HIPERDISPATCH_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index badeaa5ccd83..5ceb08b338d3 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -51,7 +51,7 @@ obj-$(CONFIG_SYSFS) += nospec-sysfs.o
 CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
 
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
+obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o
 obj-$(CONFIG_NUMA) += numa.o
 obj-$(CONFIG_AUDIT) += audit.o
 compat-obj-$(CONFIG_AUDIT) += compat_audit.o
diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c
new file mode 100644
index 000000000000..233872d59b76
--- /dev/null
+++ b/arch/s390/kernel/hiperdispatch.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "hd"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+/*
+ * Hiperdispatch:
+ * Dynamically calculates the optimum number of high capacity COREs
+ * by considering the state the system is in. When hiperdispatch decides
+ * that a capacity update is necessary, it schedules a topology update.
+ * During topology updates the CPU capacities are always re-adjusted.
+ *
+ * There are two places where CPU capacities are accessed within
+ * hiperdispatch:
+ * -> hiperdispatch's recurring work function reads CPU capacities to
+ *    determine the high capacity CPU count.
+ * -> during a topology update hiperdispatch's adjustment function
+ *    updates CPU capacities.
+ * These two can run on different CPUs in parallel, which can cause
+ * hiperdispatch to make wrong decisions. This can potentially cause
+ * some overhead by leading to extra rebuild_sched_domains() calls
+ * for correction. Access to capacities within hiperdispatch has to be
+ * serialized to prevent this overhead.
+ *
+ * Hiperdispatch decision making revolves around steal time.
+ * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal
+ * time crosses the threshold value, hiperdispatch falls back to giving
+ * high capacities to entitled CPUs only. When steal time drops below
+ * the threshold boundary, hiperdispatch utilizes all CPUs by giving all
+ * of them high capacity.
+ *
+ * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
+ * performance. Comparing the throughput of:
+ * - a single CORE, with N threads, running N tasks
+ * - N separate COREs running N tasks,
+ * using individual COREs for individual tasks yields better
+ * performance. This performance difference is roughly ~30% (it can
+ * vary between machine generations).
+ *
+ * Hiperdispatch tries to hint the scheduler to use individual COREs
+ * for each task, as long as steal time on those COREs is less than
+ * 30%, therefore delaying the throughput loss caused by using SMP
+ * threads.
+ */
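The decision rule described in the comment above reduces to a two-way choice between the online and the entitled CORE count. As a minimal user-space sketch of that rule (not part of the patch; hd_target_cores() is a hypothetical helper, only HD_STEAL_THRESHOLD comes from the code below):

#include <stdio.h>

#define HD_STEAL_THRESHOLD 30

/* Pick the number of COREs that should run at high capacity. */
static int hd_target_cores(int online_cores, int entitled_cores,
			   unsigned long steal_percentage)
{
	/* Below the threshold every CORE gets high capacity; at or
	 * above it, only the entitled (vertical high/medium) COREs do. */
	return steal_percentage < HD_STEAL_THRESHOLD ? online_cores
						     : entitled_cores;
}

int main(void)
{
	printf("%d\n", hd_target_cores(8, 6, 12));	/* low steal  -> 8 */
	printf("%d\n", hd_target_cores(8, 6, 45));	/* high steal -> 6 */
	return 0;
}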
+
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/ktime.h>
+#include <linux/workqueue.h>
+#include <asm/hiperdispatch.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+
+#define HD_DELAY_FACTOR (4)
+#define HD_DELAY_INTERVAL (HZ / 4)
+#define HD_STEAL_THRESHOLD 30
+
+static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
+static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
+static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
+static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
+static int hd_online_cores;		/* Current online CORE count */
+
+static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
+
+static void hd_capacity_work_fn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
+
+void hd_reset_state(void)
+{
+	cpumask_clear(&hd_vl_coremask);
+	cpumask_clear(&hd_vmvl_cpumask);
+	hd_entitled_cores = 0;
+	hd_online_cores = 0;
+}
+
+void hd_add_core(int cpu)
+{
+	const struct cpumask *siblings;
+	int polarization;
+
+	hd_online_cores++;
+	polarization = smp_cpu_get_polarization(cpu);
+	siblings = topology_sibling_cpumask(cpu);
+	switch (polarization) {
+	case POLARIZATION_VH:
+		hd_entitled_cores++;
+		break;
+	case POLARIZATION_VM:
+		hd_entitled_cores++;
+		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+		break;
+	case POLARIZATION_VL:
+		cpumask_set_cpu(cpu, &hd_vl_coremask);
+		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+		break;
+	}
+}
+
+static void hd_update_capacities(void)
+{
+	int cpu, upscaling_cores;
+	unsigned long capacity;
+
+	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
+	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
+	hd_high_capacity_cores = hd_entitled_cores;
+	for_each_cpu(cpu, &hd_vl_coremask) {
+		smp_set_core_capacity(cpu, capacity);
+		if (capacity != CPU_CAPACITY_HIGH)
+			continue;
+		hd_high_capacity_cores++;
+		upscaling_cores--;
+		if (upscaling_cores == 0)
+			capacity = CPU_CAPACITY_LOW;
+	}
+}
+
+void hd_disable_hiperdispatch(void)
+{
+	cancel_delayed_work_sync(&hd_capacity_work);
+	hd_high_capacity_cores = hd_online_cores;
+	hd_previous_steal = 0;
+}
+
+int hd_enable_hiperdispatch(void)
+{
+	if (hd_entitled_cores == 0)
+		return 0;
+	if (hd_online_cores <= hd_entitled_cores)
+		return 0;
+	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * HD_DELAY_FACTOR);
+	hd_update_capacities();
+	return 1;
+}
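hd_update_capacities() above walks the vertical low COREs and hands high capacity to the first upscaling_cores of them, downgrading the rest. A standalone sketch of that walk, using a flat array instead of a cpumask; the CPU_CAPACITY_* values here are assumptions for illustration only:

#include <stdio.h>

#define CPU_CAPACITY_HIGH 1024				/* assumed value */
#define CPU_CAPACITY_LOW  (CPU_CAPACITY_HIGH >> 2)	/* assumed value */

/* Give the first "upscaling" vertical low COREs high capacity,
 * the remaining ones low capacity. */
static void upscale(unsigned long *vl_capacity, int vl_cores, int upscaling)
{
	for (int i = 0; i < vl_cores; i++)
		vl_capacity[i] = i < upscaling ? CPU_CAPACITY_HIGH
					       : CPU_CAPACITY_LOW;
}

int main(void)
{
	unsigned long cap[4];
	int i;

	/* target of entitled + 2 COREs: the first two VL COREs get upscaled */
	upscale(cap, 4, 2);
	for (i = 0; i < 4; i++)
		printf("VL CORE %d capacity: %lu\n", i, cap[i]);
	return 0;
}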
+
+static unsigned long hd_calculate_steal_percentage(void)
+{
+	unsigned long time_delta, steal_delta, steal, percentage;
+	static ktime_t prev;
+	int cpus, cpu;
+	ktime_t now;
+
+	cpus = 0;
+	steal = 0;
+	percentage = 0;
+	for_each_cpu(cpu, &hd_vmvl_cpumask) {
+		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+		cpus++;
+	}
+	/*
+	 * If there are no vertical medium or low CPUs, steal time
+	 * is 0, as vertical high CPUs shouldn't experience steal time.
+	 */
+	if (cpus == 0)
+		return percentage;
+	now = ktime_get();
+	time_delta = ktime_to_ns(ktime_sub(now, prev));
+	if (steal > hd_previous_steal && hd_previous_steal != 0) {
+		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
+		percentage = steal_delta / cpus;
+	}
+	hd_previous_steal = steal;
+	prev = now;
+	return percentage;
+}
+
+static void hd_capacity_work_fn(struct work_struct *work)
+{
+	unsigned long steal_percentage, new_cores;
+
+	mutex_lock(&smp_cpu_state_mutex);
+	/*
+	 * If online cores are less than or equal to entitled cores,
+	 * hiperdispatch does not need to make any adjustments; call a
+	 * topology update to disable hiperdispatch.
+	 * Normally this check is handled on topology update, but during
+	 * CPU hot unplug, topology and CPU mask updates are done in
+	 * reverse order, causing hd_enable_hiperdispatch() to get stale
+	 * data.
+	 */
+	if (hd_online_cores <= hd_entitled_cores) {
+		topology_schedule_update();
+		mutex_unlock(&smp_cpu_state_mutex);
+		return;
+	}
+	steal_percentage = hd_calculate_steal_percentage();
+	if (steal_percentage < HD_STEAL_THRESHOLD)
+		new_cores = hd_online_cores;
+	else
+		new_cores = hd_entitled_cores;
+	if (hd_high_capacity_cores != new_cores) {
+		hd_high_capacity_cores = new_cores;
+		topology_schedule_update();
+	}
+	mutex_unlock(&smp_cpu_state_mutex);
+	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
+}
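To see what hd_calculate_steal_percentage() actually returns: the steal counters are cumulative nanosecond values, so the function computes the steal time accumulated between two samples relative to wall-clock time, scaled to a percentage and averaged over the sampled CPUs. A worked example with made-up numbers (assumes 64-bit unsigned long, as on s390):

#include <stdio.h>

int main(void)
{
	/* hypothetical numbers: two VM/VL CPUs sampled 1 s apart, which
	 * together accumulated 0.8 s of steal time in that window */
	unsigned long previous_steal = 5000000000UL;	/* ns, summed over CPUs */
	unsigned long steal          = 5800000000UL;	/* ns, one interval later */
	unsigned long time_delta     = 1000000000UL;	/* wall-clock ns between samples */
	int cpus = 2;
	unsigned long steal_delta, percentage;

	steal_delta = (steal - previous_steal) * 100 / time_delta;
	percentage = steal_delta / cpus;
	printf("steal percentage: %lu\n", percentage);	/* 40: above the 30 threshold */
	return 0;
}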
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 58da6d1bae45..813e5da9a973 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -24,6 +24,7 @@
 #include <linux/mm.h>
 #include <linux/nodemask.h>
 #include <linux/node.h>
+#include <asm/hiperdispatch.h>
 #include <asm/sysinfo.h>
 
 #define PTF_HORIZONTAL	(0UL)
@@ -47,6 +48,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED;
 static void set_topology_timer(void);
 static void topology_work_fn(struct work_struct *work);
 static struct sysinfo_15_1_x *tl_info;
+static int cpu_management;
 
 static DECLARE_WORK(topology_work, topology_work_fn);
 
@@ -144,6 +146,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
 			cpumask_set_cpu(cpu, &book->mask);
 			cpumask_set_cpu(cpu, &socket->mask);
 			smp_cpu_set_polarization(cpu, tl_core->pp);
+			smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH);
 		}
 	}
 }
@@ -270,6 +273,7 @@ void update_cpu_masks(void)
 			topo->drawer_id = id;
 		}
 	}
+	hd_reset_state();
 	for_each_online_cpu(cpu) {
 		topo = &cpu_topology[cpu];
 		pkg_first = cpumask_first(&topo->core_mask);
@@ -278,8 +282,10 @@ void update_cpu_masks(void)
 			for_each_cpu(sibling, &topo->core_mask) {
 				topo_sibling = &cpu_topology[sibling];
 				smt_first = cpumask_first(&topo_sibling->thread_mask);
-				if (sibling == smt_first)
+				if (sibling == smt_first) {
 					topo_package->booted_cores++;
+					hd_add_core(sibling);
+				}
 			}
 		} else {
 			topo->booted_cores = topo_package->booted_cores;
@@ -303,8 +309,10 @@ static void __arch_update_dedicated_flag(void *arg)
 static int __arch_update_cpu_topology(void)
 {
 	struct sysinfo_15_1_x *info = tl_info;
-	int rc = 0;
+	int rc, hd_status;
 
+	hd_status = 0;
+	rc = 0;
 	mutex_lock(&smp_cpu_state_mutex);
 	if (MACHINE_HAS_TOPOLOGY) {
 		rc = 1;
@@ -314,7 +322,11 @@ static int __arch_update_cpu_topology(void)
 	update_cpu_masks();
 	if (!MACHINE_HAS_TOPOLOGY)
 		topology_update_polarization_simple();
+	if (cpu_management == 1)
+		hd_status = hd_enable_hiperdispatch();
 	mutex_unlock(&smp_cpu_state_mutex);
+	if (hd_status == 0)
+		hd_disable_hiperdispatch();
 	return rc;
 }
 
@@ -374,8 +386,6 @@ void topology_expect_change(void)
 	set_topology_timer();
 }
 
-static int cpu_management;
-
 static int set_polarization(int polarization)
 {
 	int rc = 0;
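The topology update path above is also what arms the sampling: hd_enable_hiperdispatch() schedules the delayed work with HD_DELAY_INTERVAL * HD_DELAY_FACTOR, and hd_capacity_work_fn() re-arms itself with HD_DELAY_INTERVAL. Since both are expressed in jiffies, the wall-clock cadence is independent of the HZ choice; a small arithmetic sketch (the HZ value here is an arbitrary assumption, HZ is a kernel config choice):

#include <stdio.h>

#define HZ 100				/* assumed; config dependent in the kernel */
#define HD_DELAY_FACTOR (4)
#define HD_DELAY_INTERVAL (HZ / 4)

int main(void)
{
	/* one jiffy is 1000 / HZ milliseconds */
	printf("first check after enable: %d ms\n",
	       HD_DELAY_INTERVAL * HD_DELAY_FACTOR * 1000 / HZ);	/* 1000 ms */
	printf("steady-state interval:    %d ms\n",
	       HD_DELAY_INTERVAL * 1000 / HZ);				/* 250 ms */
	return 0;
}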