aboutsummaryrefslogtreecommitdiff
path: root/kernel/power/energy_model.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 16:04:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 16:04:28 -0700
commit833db72142b93a89211c1e43ca0a1e2e16457756 (patch)
treeac074094a3ff4054478273023f0f8b05dd6318fa /kernel/power/energy_model.c
parentc0d6586afa3546a3d148cf4b9d9a407b4f79d0bb (diff)
parentbf56b90797c4a03c94b702c50e62296edea9fad9 (diff)
Merge tag 'pm-5.16-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull power management updates from Rafael Wysocki: "These make the power management of PCI devices with ACPI companions more straightforwad, add support for inefficient operating performance points to the Energy model and make cpufreq handle them as appropriate, rearrange the handling of cpuidle during system PM transitions, update a few cpufreq drivers and intel_idle, fix assorded issues and clean up code in multiple places. Specifics: - Add support for inefficient operating performance points to the Energy Model and modify cpufreq to use them properly (Vincent Donnefort). - Rearrange the DTPM framework code to simplify it and make it easier to follow (Daniel Lezcano). - Fix power intialization in DTPM (Daniel Lezcano). - Add CPU load consideration when estimating the instaneous power consumption in DTPM (Daniel Lezcano). - Fix cpu->pstate.turbo_freq initialization in intel_pstate (Zhang Rui). - Make intel_pstate process HWP Guaranteed change notifications from the processor (Srinivas Pandruvada). - Fix typo in cpufreq.h (Rafael Wysocki). - Fix tegra driver to handle BPMP errors properly (Mikko Perttunen). - Fix the parameter usage of the newly added perf-domain API (Hector Yuan). - Minor cleanups to cppc, vexpress and s3c244x drivers (Han Wang, Guenter Roeck, and Arnd Bergmann). - Fix kobject memory leaks in cpuidle error paths (Anel Orazgaliyeva). - Make intel_idle enable interrupts before entering C1 on some Xeon processor models (Artem Bityutskiy). - Clean up hib_wait_io() (Falla Coulibaly). - Fix sparse warnings in hibernation-related code (Anders Roxell). - Use vzalloc() and kzalloc() instead of their open-coded equivalents in hibernation-related code (Cai Huoqing). - Prevent user space from crashing the kernel by attempting to restore the system state from a swap partition in use (Ye Bin). - Do not let "syscore" devices runtime-suspend during system PM transitions (Rafael Wysocki). - Do not pause cpuidle in the suspend-to-idle path (Rafael Wysocki). - Pause cpuidle later and resume it earlier during system PM transitions (Rafael Wysocki). - Make system suspend code use valid_state() consistently (Rafael Wysocki). - Add support for enabling wakeup IRQs after invoking the ->runtime_suspend() callback and make two drivers use it (Chunfeng Yun). - Make the association of ACPI device objects with PCI devices more straightforward and simplify the code doing that for all devices in general (Rafael Wysocki). - Eliminate struct pci_platform_pm_ops and handle the both of its users (PCI and Intel MID) directly in the PCI bus code (Rafael Wysocki). - Simplify and clarify ACPI PCI device PM helpers (Rafael Wysocki). - Fix ordering of operations in pci_back_from_sleep() (Rafael Wysocki). - Make exynos-ppmu use hyphens in DT properties (Krzysztof Kozlowski). - Simplify parsing event-type from DT in exynos-ppmu (Krzysztof Kozlowski). - Strengthen check for freq_table in devfreq (Samuel Holland)" * tag 'pm-5.16-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: (49 commits) cpufreq: Fix parameter in parse_perf_domain() usb: mtu3: enable wake-up interrupt after runtime_suspend called usb: xhci-mtk: enable wake-up interrupt after runtime_suspend called PM / wakeirq: support enabling wake-up irq after runtime_suspend called PM / devfreq: Strengthen check for freq_table devfreq: exynos-ppmu: simplify parsing event-type from DT devfreq: exynos-ppmu: use node names with hyphens cpufreq: intel_pstate: Fix cpu->pstate.turbo_freq initialization PM: suspend: Use valid_state() consistently PM: sleep: Pause cpuidle later and resume it earlier during system transitions PM: suspend: Do not pause cpuidle in the suspend-to-idle path PM: sleep: Do not let "syscore" devices runtime-suspend during system transitions PM: hibernate: Get block device exclusively in swsusp_check() powercap/drivers/dtpm: Fix power limit initialization powercap/drivers/dtpm: Scale the power with the load powercap/drivers/dtpm: Use container_of instead of a private data field powercap/drivers/dtpm: Simplify the dtpm table powercap/drivers/dtpm: Encapsulate even more the code PM: hibernate: swap: Use vzalloc() and kzalloc() PM: hibernate: fix sparse warnings ...
Diffstat (limited to 'kernel/power/energy_model.c')
-rw-r--r--kernel/power/energy_model.c86
1 files changed, 68 insertions, 18 deletions
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index a332ccd829e2..0153b0ca7b23 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -2,7 +2,7 @@
/*
* Energy Model of devices
*
- * Copyright (c) 2018-2020, Arm ltd.
+ * Copyright (c) 2018-2021, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
* Improvements provided by: Lukasz Luba, Arm ltd.
*/
@@ -10,6 +10,7 @@
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
@@ -42,6 +43,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
debugfs_create_ulong("power", 0444, d, &ps->power);
debugfs_create_ulong("cost", 0444, d, &ps->cost);
+ debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -55,7 +57,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
static int em_debug_units_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+ char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
+ "milliWatts" : "bogoWatts";
seq_printf(s, "%s\n", units);
@@ -63,6 +66,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+ int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
+
+ seq_printf(s, "%d\n", enabled);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+
static void em_debug_create_pd(struct device *dev)
{
struct dentry *d;
@@ -76,6 +90,8 @@ static void em_debug_create_pd(struct device *dev)
&em_debug_cpus_fops);
debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+ debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
+ &em_debug_skip_inefficiencies_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -107,8 +123,7 @@ static void em_debug_remove_pd(struct device *dev) {}
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
int nr_states, struct em_data_callback *cb)
{
- unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
- unsigned long power, freq, prev_freq = 0;
+ unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
int i, ret;
u64 fmax;
@@ -153,27 +168,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
table[i].power = power;
table[i].frequency = prev_freq = freq;
-
- /*
- * The hertz/watts efficiency ratio should decrease as the
- * frequency grows on sane platforms. But this isn't always
- * true in practice so warn the user if a higher OPP is more
- * power efficient than a lower one.
- */
- opp_eff = freq / power;
- if (opp_eff >= prev_opp_eff)
- dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
- i, i - 1);
- prev_opp_eff = opp_eff;
}
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
- for (i = 0; i < nr_states; i++) {
+ for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res = em_scale_power(table[i].power);
table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
}
pd->table = table;
@@ -222,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states,
return 0;
}
+static void em_cpufreq_update_efficiencies(struct device *dev)
+{
+ struct em_perf_domain *pd = dev->em_pd;
+ struct em_perf_state *table;
+ struct cpufreq_policy *policy;
+ int found = 0;
+ int i;
+
+ if (!_is_cpu_device(dev) || !pd)
+ return;
+
+ policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ return;
+ }
+
+ table = pd->table;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+ continue;
+
+ if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+ found++;
+ }
+
+ if (!found)
+ return;
+
+ /*
+ * Efficiencies have been installed in CPUFreq, inefficient frequencies
+ * will be skipped. The EM can do the same.
+ */
+ pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
+}
+
/**
* em_pd_get() - Return the performance domain for a device
* @dev : Device to find the performance domain for
@@ -335,7 +382,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
if (ret)
goto unlock;
- dev->em_pd->milliwatts = milliwatts;
+ if (milliwatts)
+ dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+
+ em_cpufreq_update_efficiencies(dev);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");