diff options
author | Lukasz Luba <lukasz.luba@arm.com> | 2024-02-08 11:55:49 +0000 |
---|---|---|
committer | Rafael J. Wysocki <rafael.j.wysocki@intel.com> | 2024-02-08 15:00:31 +0100 |
commit | 1b600da510735a0f92c8b4140a7e2cb037a6a6c3 (patch) | |
tree | ea67df6e89321d69a5ae51acbbc6cc183dc15562 /include/linux/energy_model.h | |
parent | e3f1164fc9ee8430b3a51e400abfa1b67664f538 (diff) |
PM: EM: Optimize em_cpu_energy() and remove division
The Energy Model (EM) can be modified at runtime which brings new
possibilities. The em_cpu_energy() is called by the Energy Aware Scheduler
(EAS) in its hot path. The energy calculation uses power value for
a given performance state (ps) and the CPU busy time as percentage for that
given frequency.
It is possible to avoid the division by 'scale_cpu' at runtime, because
EM is updated whenever new max capacity CPU is set in the system.
Use that feature and do the needed division during the calculation of the
coefficient 'ps->cost'. That enhanced 'ps->cost' value can be then just
multiplied simply by utilization:
pd_nrg = ps->cost * \Sum cpu_util
to get the needed energy for whole Performance Domain (PD).
With this optimization and earlier removal of map_util_freq(), the
em_cpu_energy() should run faster on the Big CPU by 1.43x and on the Little
CPU by 1.69x (RockPi 4B board).
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Diffstat (limited to 'include/linux/energy_model.h')
-rw-r--r-- | include/linux/energy_model.h | 55 |
1 files changed, 15 insertions, 40 deletions
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index ce24ea3fe41c..aabfc26fcd31 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -115,27 +115,6 @@ struct em_perf_domain { #define EM_MAX_NUM_CPUS 16 #endif -/* - * To avoid an overflow on 32bit machines while calculating the energy - * use a different order in the operation. First divide by the 'cpu_scale' - * which would reduce big value stored in the 'cost' field, then multiply by - * the 'sum_util'. This would allow to handle existing platforms, which have - * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts. - * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util' - * could be 4096, then multiplication: 'cost' * 'sum_util' would overflow. - * This reordering of operations has some limitations, we lose small - * precision in the estimation (comparing to 64bit platform w/o reordering). - * - * We are safe on 64bit machine. - */ -#ifdef CONFIG_64BIT -#define em_estimate_energy(cost, sum_util, scale_cpu) \ - (((cost) * (sum_util)) / (scale_cpu)) -#else -#define em_estimate_energy(cost, sum_util, scale_cpu) \ - (((cost) / (scale_cpu)) * (sum_util)) -#endif - struct em_data_callback { /** * active_power() - Provide power at the next performance state of @@ -249,8 +228,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { struct em_perf_table *em_table; struct em_perf_state *ps; - unsigned long scale_cpu; - int cpu, i; + int i; #ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); @@ -267,9 +245,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * max utilization to the allowed CPU capacity before calculating * effective performance. */ - cpu = cpumask_first(to_cpumask(pd->cpus)); - scale_cpu = arch_scale_cpu_capacity(cpu); - + max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); /* @@ -282,12 +258,12 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, ps = &em_table->state[i]; /* - * The capacity of a CPU in the domain at the performance state (ps) - * can be computed as: + * The performance (capacity) of a CPU in the domain at the performance + * state (ps) can be computed as: * - * ps->freq * scale_cpu - * ps->cap = -------------------- (1) - * cpu_max_freq + * ps->freq * scale_cpu + * ps->performance = -------------------- (1) + * cpu_max_freq * * So, ignoring the costs of idle states (which are not available in * the EM), the energy consumed by this CPU at that performance state @@ -295,9 +271,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * * ps->power * cpu_util * cpu_nrg = -------------------- (2) - * ps->cap + * ps->performance * - * since 'cpu_util / ps->cap' represents its percentage of busy time. + * since 'cpu_util / ps->performance' represents its percentage of busy + * time. * * NOTE: Although the result of this computation actually is in * units of power, it can be manipulated as an energy value @@ -307,9 +284,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product * of two terms: * - * ps->power * cpu_max_freq cpu_util - * cpu_nrg = ------------------------ * --------- (3) - * ps->freq scale_cpu + * ps->power * cpu_max_freq + * cpu_nrg = ------------------------ * cpu_util (3) + * ps->freq * scale_cpu * * The first term is static, and is stored in the em_perf_state struct * as 'ps->cost'. @@ -319,11 +296,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * total energy of the domain (which is the simple sum of the energy of * all of its CPUs) can be factorized as: * - * ps->cost * \Sum cpu_util - * pd_nrg = ------------------------ (4) - * scale_cpu + * pd_nrg = ps->cost * \Sum cpu_util (4) */ - return em_estimate_energy(ps->cost, sum_util, scale_cpu); + return ps->cost * sum_util; } /** |