4 files changed, 228 insertions, 5 deletions
diff --git a/arch/s390/include/asm/hiperdispatch.h b/arch/s390/include/asm/hiperdispatch.h
new file mode 100644
index 000000000000..27e23aa27a24
--- /dev/null
+++ b/arch/s390/include/asm/hiperdispatch.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#ifndef _ASM_HIPERDISPATCH_H
+#define _ASM_HIPERDISPATCH_H
+
+void hd_reset_state(void);		/* clear CORE masks and entitled/online counts */
+void hd_add_core(int cpu);		/* account one online CORE by its polarization */
+void hd_disable_hiperdispatch(void);	/* cancel the capacity work and reset its state */
+int hd_enable_hiperdispatch(void);	/* returns 1 if the capacity work was armed, else 0 */
+
+#endif /* _ASM_HIPERDISPATCH_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index badeaa5ccd83..5ceb08b338d3 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -51,7 +51,7 @@ obj-$(CONFIG_SYSFS)		+= nospec-sysfs.o
 CFLAGS_REMOVE_nospec-branch.o	+= $(CC_FLAGS_EXPOLINE)
 
 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SCHED_TOPOLOGY)	+= topology.o
+obj-$(CONFIG_SCHED_TOPOLOGY)	+= topology.o hiperdispatch.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_AUDIT)		+= audit.o
 compat-obj-$(CONFIG_AUDIT)	+= compat_audit.o
diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c
new file mode 100644
index 000000000000..233872d59b76
--- /dev/null
+++ b/arch/s390/kernel/hiperdispatch.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "hd"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+/*
+ * Hiperdispatch:
+ * Dynamically calculates the optimum number of high capacity COREs
+ * by considering the state the system is in. When hiperdispatch decides
+ * that a capacity update is necessary, it schedules a topology update.
+ * During topology updates the CPU capacities are always re-adjusted.
+ *
+ * There are two places where CPU capacities are being accessed within
+ * hiperdispatch.
+ * -> hiperdispatch's recurring work function reads CPU capacities to
+ *    determine high capacity CPU count.
+ * -> during a topology update hiperdispatch's adjustment function
+ *    updates CPU capacities.
+ * These two can run on different CPUs in parallel which can cause
+ * hiperdispatch to make wrong decisions. This can potentially cause
+ * some overhead by leading to extra rebuild_sched_domains() calls
+ * for correction. Access to capacities within hiperdispatch has to be
+ * serialized to prevent the overhead.
+ *
+ * Hiperdispatch decision making revolves around steal time.
+ * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
+ * crosses the threshold value hiperdispatch falls back to giving high
+ * capacities to entitled CPUs. When steal time drops below the
+ * threshold boundary, hiperdispatch utilizes all CPUs by giving all
+ * of them high capacity.
+ *
+ * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
+ * performance. Comparing the throughput of:
+ * - single CORE, with N threads, running N tasks
+ * - N separate COREs running N tasks,
+ * using individual COREs for individual tasks yields better
+ * performance. This performance difference is roughly 30% (can change
+ * between machine generations).
+ *
+ * Hiperdispatch tries to hint the scheduler to use individual COREs for
+ * each task, as long as steal time on those COREs is less than 30%,
+ * therefore delaying the throughput loss caused by using SMP threads.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
+#include <linux/ktime.h>
+#include <linux/workqueue.h>
+#include <asm/hiperdispatch.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+
+#define HD_DELAY_FACTOR			(4)		/* Multiplies the delay used when enabling */
+#define HD_DELAY_INTERVAL		(HZ / 4)	/* Delay between capacity work runs */
+#define HD_STEAL_THRESHOLD		30		/* Steal percentage below which all COREs are used */
+
+static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
+static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
+static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
+static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
+static int hd_online_cores;		/* Current online CORE count */
+
+static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
+
+static void hd_capacity_work_fn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
+
+void hd_reset_state(void)	/* forget everything recorded by hd_add_core() */
+{
+	cpumask_clear(&hd_vl_coremask);
+	cpumask_clear(&hd_vmvl_cpumask);
+	hd_entitled_cores = 0;
+	hd_online_cores = 0;	/* hd_high_capacity_cores and hd_previous_steal are not reset here */
+}
+
+void hd_add_core(int cpu)	/* account one online CORE; update_cpu_masks() passes its first thread */
+{
+	const struct cpumask *siblings;
+	int polarization;
+
+	hd_online_cores++;	/* counted regardless of polarization */
+	polarization = smp_cpu_get_polarization(cpu);
+	siblings = topology_sibling_cpumask(cpu);
+	switch (polarization) {
+	case POLARIZATION_VH:	/* entitled, not sampled for steal time */
+		hd_entitled_cores++;
+		break;
+	case POLARIZATION_VM:	/* entitled, all threads sampled for steal time */
+		hd_entitled_cores++;
+		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+		break;
+	case POLARIZATION_VL:	/* not entitled, capacity is adjusted by hd_update_capacities() */
+		cpumask_set_cpu(cpu, &hd_vl_coremask);	/* only @cpu represents the CORE in this mask */
+		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
+		break;
+	}	/* any other polarization is only counted as online */
+}
+
+static void hd_update_capacities(void)	/* assign high/low capacity to the vertical low COREs */
+{
+	int cpu, upscaling_cores;
+	unsigned long capacity;
+
+	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;	/* vertical low COREs to raise */
+	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
+	hd_high_capacity_cores = hd_entitled_cores;	/* recounted by the loop below */
+	for_each_cpu(cpu, &hd_vl_coremask) {
+		smp_set_core_capacity(cpu, capacity);
+		if (capacity != CPU_CAPACITY_HIGH)
+			continue;	/* low capacity COREs are not counted */
+		hd_high_capacity_cores++;
+		upscaling_cores--;
+		if (upscaling_cores == 0)	/* quota used up, remaining COREs get low capacity */
+			capacity = CPU_CAPACITY_LOW;
+	}
+}
+
+void hd_disable_hiperdispatch(void)	/* stop the capacity work and reset its decision state */
+{
+	cancel_delayed_work_sync(&hd_capacity_work);	/* work fn takes smp_cpu_state_mutex: must not be held here */
+	hd_high_capacity_cores = hd_online_cores;	/* treat every online CORE as high capacity */
+	hd_previous_steal = 0;	/* next steal sample only establishes a new baseline */
+}
+
+int hd_enable_hiperdispatch(void)	/* returns 1 if the capacity work was armed, 0 if not applicable */
+{
+	if (hd_entitled_cores == 0)	/* no vertical high or medium COREs were recorded */
+		return 0;
+	if (hd_online_cores <= hd_entitled_cores)	/* no online COREs beyond the entitled ones */
+		return 0;
+	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * HD_DELAY_FACTOR);
+	hd_update_capacities();	/* apply the current high capacity CORE count */
+	return 1;
+}
+
+static unsigned long hd_calculate_steal_percentage(void)	/* average steal % since the previous call */
+{
+	unsigned long time_delta, steal_delta, steal, percentage;
+	static ktime_t prev;	/* timestamp of the previous sample */
+	int cpus, cpu;
+	ktime_t now;
+
+	cpus = 0;
+	steal = 0;
+	percentage = 0;
+	for_each_cpu(cpu, &hd_vmvl_cpumask) {
+		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+		cpus++;
+	}
+	/*
+	 * If there are no vertical medium or low CPUs, steal time is
+	 * 0, as vertical high CPUs shouldn't experience steal time.
+	 */
+	if (cpus == 0)
+		return percentage;
+	now = ktime_get();
+	time_delta = ktime_to_ns(ktime_sub(now, prev));	/* ns elapsed since the previous sample */
+	if (steal > hd_previous_steal && hd_previous_steal != 0) {	/* skip baseline and non-increasing totals */
+		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;	/* NOTE(review): assumes cpustat is in ns - confirm */
+		percentage = steal_delta / cpus;	/* average over the sampled CPUs */
+	}
+	hd_previous_steal = steal;
+	prev = now;
+	return percentage;
+}
+
+static void hd_capacity_work_fn(struct work_struct *work)	/* periodic steal time based capacity decision */
+{
+	unsigned long steal_percentage, new_cores;
+
+	mutex_lock(&smp_cpu_state_mutex);
+	/*
+	 * If online cores are less than or equal to entitled cores,
+	 * hiperdispatch does not need to make any adjustments; call a
+	 * topology update to disable hiperdispatch.
+	 * Normally this check is handled on topology update, but during cpu
+	 * unhotplug, topology and cpu mask updates are done in reverse
+	 * order, causing hd_enable_hiperdispatch() to get stale data.
+	 */
+	if (hd_online_cores <= hd_entitled_cores) {
+		topology_schedule_update();
+		mutex_unlock(&smp_cpu_state_mutex);
+		return;	/* the work is not rescheduled on this path */
+	}
+	steal_percentage = hd_calculate_steal_percentage();
+	if (steal_percentage < HD_STEAL_THRESHOLD)
+		new_cores = hd_online_cores;	/* low steal: every online CORE gets high capacity */
+	else
+		new_cores = hd_entitled_cores;	/* high steal: fall back to entitled COREs only */
+	if (hd_high_capacity_cores != new_cores) {
+		hd_high_capacity_cores = new_cores;
+		topology_schedule_update();	/* capacities are re-adjusted during the topology update */
+	}
+	mutex_unlock(&smp_cpu_state_mutex);
+	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);	/* re-arm for the next sample */
+}
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 58da6d1bae45..813e5da9a973 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -24,6 +24,7 @@
 #include <linux/mm.h>
 #include <linux/nodemask.h>
 #include <linux/node.h>
+#include <asm/hiperdispatch.h>
 #include <asm/sysinfo.h>
 
 #define PTF_HORIZONTAL	(0UL)
@@ -47,6 +48,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED;
 static void set_topology_timer(void);
 static void topology_work_fn(struct work_struct *work);
 static struct sysinfo_15_1_x *tl_info;
+static int cpu_management;
 
 static DECLARE_WORK(topology_work, topology_work_fn);
 
@@ -144,6 +146,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
 			cpumask_set_cpu(cpu, &book->mask);
 			cpumask_set_cpu(cpu, &socket->mask);
 			smp_cpu_set_polarization(cpu, tl_core->pp);
+			smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH);
 		}
 	}
 }
@@ -270,6 +273,7 @@ void update_cpu_masks(void)
 			topo->drawer_id = id;
 		}
 	}
+	hd_reset_state();
 	for_each_online_cpu(cpu) {
 		topo = &cpu_topology[cpu];
 		pkg_first = cpumask_first(&topo->core_mask);
@@ -278,8 +282,10 @@ void update_cpu_masks(void)
 			for_each_cpu(sibling, &topo->core_mask) {
 				topo_sibling = &cpu_topology[sibling];
 				smt_first = cpumask_first(&topo_sibling->thread_mask);
-				if (sibling == smt_first)
+				if (sibling == smt_first) {
 					topo_package->booted_cores++;
+					hd_add_core(sibling);
+				}
 			}
 		} else {
 			topo->booted_cores = topo_package->booted_cores;
@@ -303,8 +309,10 @@ static void __arch_update_dedicated_flag(void *arg)
 static int __arch_update_cpu_topology(void)
 {
 	struct sysinfo_15_1_x *info = tl_info;
-	int rc = 0;
+	int rc, hd_status;
 
+	hd_status = 0;
+	rc = 0;
 	mutex_lock(&smp_cpu_state_mutex);
 	if (MACHINE_HAS_TOPOLOGY) {
 		rc = 1;
@@ -314,7 +322,11 @@ static int __arch_update_cpu_topology(void)
 	update_cpu_masks();
 	if (!MACHINE_HAS_TOPOLOGY)
 		topology_update_polarization_simple();
+	if (cpu_management == 1)
+		hd_status = hd_enable_hiperdispatch();
 	mutex_unlock(&smp_cpu_state_mutex);
+	if (hd_status == 0)
+		hd_disable_hiperdispatch();
 	return rc;
 }
 
@@ -374,8 +386,6 @@ void topology_expect_change(void)
 	set_topology_timer();
 }
 
-static int cpu_management;
-
 static int set_polarization(int polarization)
 {
 	int rc = 0;