From afde996a33ee4dbe3692e1eff28b56c820331428 Mon Sep 17 00:00:00 2001 From: Dhruva Gole Date: Mon, 18 Mar 2024 20:46:32 +0530 Subject: PM: wakeup: make device_wakeup_disable() return void The device_wakeup_disable() call only returns an error if no dev exists, but there's not much a user can do at that point. Rather, make this function return void. Signed-off-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 6eb9adaef52b..428803eed798 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -107,7 +107,7 @@ extern void wakeup_sources_read_unlock(int idx); extern struct wakeup_source *wakeup_sources_walk_start(void); extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws); extern int device_wakeup_enable(struct device *dev); -extern int device_wakeup_disable(struct device *dev); +extern void device_wakeup_disable(struct device *dev); extern void device_set_wakeup_capable(struct device *dev, bool capable); extern int device_set_wakeup_enable(struct device *dev, bool enable); extern void __pm_stay_awake(struct wakeup_source *ws); @@ -154,10 +154,9 @@ static inline int device_wakeup_enable(struct device *dev) return 0; } -static inline int device_wakeup_disable(struct device *dev) +static inline void device_wakeup_disable(struct device *dev) { dev->power.should_wakeup = false; - return 0; } static inline int device_set_wakeup_enable(struct device *dev, bool enable) -- cgit From 3642c7ed52312ac2b95c9aba45c40e50bd8798ad Mon Sep 17 00:00:00 2001 From: Dhruva Gole Date: Mon, 18 Mar 2024 20:46:33 +0530 Subject: PM: wakeup: Remove unnecessary else from device_init_wakeup() Checkpatch warns that else is generally not necessary after a return condition which exists in the if part of this function. Hence, just to abide by what checkpatch recommends, follow it's guidelines. Signed-off-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 428803eed798..76cd1f9f1365 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -234,11 +234,10 @@ static inline int device_init_wakeup(struct device *dev, bool enable) if (enable) { device_set_wakeup_capable(dev, true); return device_wakeup_enable(dev); - } else { - device_wakeup_disable(dev); - device_set_wakeup_capable(dev, false); - return 0; } + device_wakeup_disable(dev); + device_set_wakeup_capable(dev, false); + return 0; } #endif /* _LINUX_PM_WAKEUP_H */ -- cgit From e3ac0f367d5806af09d2070bb7951af2f59d1f52 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:04 +0100 Subject: OPP: OF: Export dev_opp_pm_calc_power() for usage from EM There are device drivers which can modify voltage values for OPPs. It could be due to the chip binning and those drivers have specific chip knowledge about it. This adjustment can happen after Energy Model is registered, thus EM can have stale data about power. Export dev_opp_pm_calc_power() which can be used by Energy Model to calculate new power with the new voltage for OPPs. Acked-by: Viresh Kumar Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/opp/of.c | 17 ++++++++++++----- include/linux/pm_opp.h | 8 ++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/opp/of.c b/drivers/opp/of.c index f9f0b22bccbb..282eb5966fd0 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1494,20 +1494,26 @@ _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) return 0; } -/* - * Callback function provided to the Energy Model framework upon registration. +/** + * dev_pm_opp_calc_power() - Calculate power value for device with EM + * @dev : Device for which an Energy Model has to be registered + * @uW : New power value that is calculated + * @kHz : Frequency for which the new power is calculated + * * This computes the power estimated by @dev at @kHz if it is the frequency * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled * frequency and @uW to the associated power. The power is estimated as * P = C * V^2 * f with C being the device's capacitance and V and f * respectively the voltage and frequency of the OPP. + * It is also used as a callback function provided to the Energy Model + * framework upon registration. * * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. */ -static int __maybe_unused _get_power(struct device *dev, unsigned long *uW, - unsigned long *kHz) +int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; @@ -1544,6 +1550,7 @@ static int __maybe_unused _get_power(struct device *dev, unsigned long *uW, return 0; } +EXPORT_SYMBOL_GPL(dev_pm_opp_calc_power); static bool _of_has_opp_microwatt_property(struct device *dev) { @@ -1619,7 +1626,7 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) goto failed; } - EM_SET_ACTIVE_POWER_CB(em_cb, _get_power); + EM_SET_ACTIVE_POWER_CB(em_cb, dev_pm_opp_calc_power); register_em: ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus, true); diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 065a47382302..dd7c8441af42 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -476,6 +476,8 @@ struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp); int of_get_required_opp_performance_state(struct device_node *np, int index); int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table); int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus); +int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, + unsigned long *kHz); static inline void dev_pm_opp_of_unregister_em(struct device *dev) { em_dev_unregister_perf_domain(dev); @@ -539,6 +541,12 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev) { } +static inline int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW, + unsigned long *kHz) +{ + return -EOPNOTSUPP; +} + static inline int of_get_required_opp_performance_state(struct device_node *np, int index) { return -EOPNOTSUPP; -- cgit From cf61d53b026805e8222ca28ac2795611eb7fa547 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 3 Apr 2024 16:49:06 +0100 Subject: PM: EM: Add em_dev_update_chip_binning() Add a function which allows to modify easily the EM after the new voltage information is available. The device drivers for the chip can adjust the voltage values after setup. The voltage for the same frequency in OPP can be different due to chip binning. The voltage impacts the power usage and the EM power values can be updated to reflect that. Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 5 +++++ kernel/power/energy_model.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 70cd7258cd29..1ff52020cf75 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -172,6 +172,7 @@ struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int nr_states); +int em_dev_update_chip_binning(struct device *dev); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -386,6 +387,10 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, { return -EINVAL; } +static inline int em_dev_update_chip_binning(struct device *dev) +{ + return -EINVAL; +} #endif #endif diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6960dd7393b2..927cc55ba0b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -808,3 +808,51 @@ static void em_update_workfn(struct work_struct *work) { em_check_capacity_update(); } + +/** + * em_dev_update_chip_binning() - Update Energy Model after the new voltage + * information is present in the OPPs. + * @dev : Device for which the Energy Model has to be updated. + * + * This function allows to update easily the EM with new values available in + * the OPP framework and DT. It can be used after the chip has been properly + * verified by device drivers and the voltages adjusted for the 'chip binning'. + */ +int em_dev_update_chip_binning(struct device *dev) +{ + struct em_perf_table __rcu *em_table; + struct em_perf_domain *pd; + int i, ret; + + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + pd = em_pd_get(dev); + if (!pd) { + dev_warn(dev, "Couldn't find Energy Model\n"); + return -EINVAL; + } + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return -ENOMEM; + } + + /* Update power values which might change due to new voltage in OPPs */ + for (i = 0; i < pd->nr_perf_states; i++) { + unsigned long freq = em_table->state[i].frequency; + unsigned long power; + + ret = dev_pm_opp_calc_power(dev, &power, &freq); + if (ret) { + em_table_free(em_table); + return ret; + } + + em_table->state[i].power = power; + } + + return em_recalc_and_update(dev, pd, em_table); +} +EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); -- cgit From b37ef7210e51b1e996ca03b03227d93f7470784b Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 25 Apr 2024 16:07:51 +0800 Subject: cpufreq: amd-pstate: Document *_limit_* fields in struct amd_cpudata The four fields of struct cpudata namely min_limit_perf, max_limit_perf, min_limit_freq, max_limit_freq introduced in the commit febab20caeba("cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update") are currently undocumented Add comments describing these fields Acked-by: Huang Rui Fixes: febab20caeba("cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update") Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index d21838835abd..83fb3fc647fc 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -49,6 +49,10 @@ struct amd_aperf_mperf { * @lowest_perf: the absolute lowest performance level of the processor * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher * priority. + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max + * @min_limit_freq: Cached value of policy->min + * @max_limit_freq: Cached value of policy->max * @max_freq: the frequency that mapped to highest_perf * @min_freq: the frequency that mapped to lowest_perf * @nominal_freq: the frequency that mapped to nominal_perf -- cgit From 4fcfd1954ad305e331b6b4b62de2874fbae61394 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 25 Apr 2024 16:07:52 +0800 Subject: cpufreq: amd-pstate: Document the units for freq variables in amd_cpudata The min_limit_freq, max_limit_freq, min_freq, max_freq, nominal_freq and the lowest_nominal_freq members of struct cpudata store the frequency value in khz to be consistent with the cpufreq core. Update the comment to document this. Reviewed-by: Li Meng Tested-by: Dhananjay Ugwekar Signed-off-by: Gautham R. Shenoy Signed-off-by: Perry Yuan Acked-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 83fb3fc647fc..ec0b0fa3e9bb 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -51,15 +51,15 @@ struct amd_aperf_mperf { * priority. * @min_limit_perf: Cached value of the performance corresponding to policy->min * @max_limit_perf: Cached value of the performance corresponding to policy->max - * @min_limit_freq: Cached value of policy->min - * @max_limit_freq: Cached value of policy->max - * @max_freq: the frequency that mapped to highest_perf - * @min_freq: the frequency that mapped to lowest_perf - * @nominal_freq: the frequency that mapped to nominal_perf - * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf + * @min_limit_freq: Cached value of policy->min (in khz) + * @max_limit_freq: Cached value of policy->max (in khz) + * @max_freq: the frequency (in khz) that mapped to highest_perf + * @min_freq: the frequency (in khz) that mapped to lowest_perf + * @nominal_freq: the frequency (in khz) that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf * @cur: Difference of Aperf/Mperf/tsc count between last and current sample * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value + * @freq: current cpu frequency value (in khz) * @boost_supported: check whether the Processor or SBIOS supports boost mode * @hw_prefcore: check whether HW supports preferred core featue. * Only when hw_prefcore and early prefcore param are true, -- cgit From eb8b6c36820214df96e7e86d8614d93f6b028f28 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 25 Apr 2024 16:07:58 +0800 Subject: cpufreq: amd-pstate: Add quirk for the pstate CPPC capabilities missing Add quirks table to get CPPC capabilities issue fixed by providing correct perf or frequency values while driver loading. If CPPC capabilities are not defined in the ACPI tables or wrongly defined by platform firmware, it needs to use quick to get those issues fixed with correct workaround values to make pstate driver can be loaded even though there are CPPC capabilities errors. The workaround will match the broken BIOS which lack of CPPC capabilities nominal_freq and lowest_freq definition in the ACPI table. $ cat /sys/devices/system/cpu/cpu0/acpi_cppc/lowest_freq 0 $ cat /sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq 0 Acked-by: Huang Rui Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Tested-by: Dhananjay Ugwekar Signed-off-by: Perry Yuan Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 53 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/amd-pstate.h | 6 +++++ 2 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 510b5aec42ea..83a29b257794 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -67,6 +67,7 @@ static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool cppc_enabled; static bool amd_pstate_prefcore = true; +static struct quirk_entry *quirks; /* * AMD Energy Preference Performance (EPP) @@ -111,6 +112,41 @@ static unsigned int epp_values[] = { typedef int (*cppc_mode_transition_fn)(int); +static struct quirk_entry quirk_amd_7k62 = { + .nominal_freq = 2600, + .lowest_freq = 550, +}; + +static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) +{ + /** + * match the broken bios for family 17h processor support CPPC V2 + * broken BIOS lack of nominal_freq and lowest_freq capabilities + * definition in ACPI tables + */ + if (boot_cpu_has(X86_FEATURE_ZEN2)) { + quirks = dmi->driver_data; + pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); + return 1; + } + + return 0; +} + +static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = { + { + .callback = dmi_matched_7k62_bios_bug, + .ident = "AMD EPYC 7K62", + .matches = { + DMI_MATCH(DMI_BIOS_VERSION, "5.14"), + DMI_MATCH(DMI_BIOS_RELEASE, "12/12/2019"), + }, + .driver_data = &quirk_amd_7k62, + }, + {} +}; +MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table); + static inline int get_mode_idx_from_str(const char *str, size_t size) { int i; @@ -812,8 +848,16 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) if (ret) return ret; - min_freq = cppc_perf.lowest_freq * 1000; - nominal_freq = cppc_perf.nominal_freq; + if (quirks && quirks->lowest_freq) + min_freq = quirks->lowest_freq * 1000; + else + min_freq = cppc_perf.lowest_freq * 1000; + + if (quirks && quirks->nominal_freq) + nominal_freq = quirks->nominal_freq ; + else + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); highest_perf = READ_ONCE(cpudata->highest_perf); @@ -1662,6 +1706,11 @@ static int __init amd_pstate_init(void) if (cpufreq_get_current_driver()) return -EEXIST; + quirks = NULL; + + /* check if this machine need CPPC quirks */ + dmi_check_system(amd_pstate_quirks_table); + switch (cppc_state) { case AMD_PSTATE_UNDEFINED: /* Disable on the following configs by default: diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index ec0b0fa3e9bb..d58fc022ec46 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -128,4 +128,10 @@ static const char * const amd_pstate_mode_string[] = { [AMD_PSTATE_GUIDED] = "guided", NULL, }; + +struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +}; + #endif /* _LINUX_AMD_PSTATE_H */ -- cgit From 575024a8aa7cf1dff49b94092f774ed1c90586be Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 28 Apr 2024 17:24:26 +0800 Subject: powercap: intel_rapl: Introduce APIs for PMU support Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu(). RAPL driver can invoke these APIs to expose its supported energy counters via perf PMU. The new RAPL PMU is fully compatible with current MSR RAPL PMU, including using the same PMU name and events name/id/unit/scale, etc. For example, use below command perf stat -e power/energy-pkg/ -e power/energy-ram/ FOO to get the energy consumption if power/energy-pkg/ and power/energy-ram/ events are available in the "perf list" output. This does not introduce any conflict because TPMI RAPL is the only user of these APIs currently, and it never co-exists with MSR RAPL. Note that RAPL Packages can be probed/removed dynamically, and the events supported by each TPMI RAPL device can be different. Thus the RAPL PMU support is done on demand, which means 1. PMU is registered only if it is needed by a RAPL Package. PMU events for unsupported counters are not exposed. 2. PMU is unregistered and registered when a new RAPL Package is probed and supports new counters that are not supported by current PMU. For example, on a dual-package system using TPMI RAPL, it is possible that Package 1 behaves as TPMI domain root and supports Psys domain. In this case, register PMU without Psys event when probing Package 0, and re-register the PMU with Psys event when probing Package 1. 3. PMU is unregistered when all registered RAPL Packages don't need PMU. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 582 +++++++++++++++++++++++++++++++++++ include/linux/intel_rapl.h | 32 ++ 2 files changed, 614 insertions(+) (limited to 'include/linux') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c4302caeb631..aac0744011a3 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -1507,6 +1509,586 @@ static int rapl_detect_domains(struct rapl_package *rp) return 0; } +#ifdef CONFIG_PERF_EVENTS + +/* + * Support for RAPL PMU + * + * Register a PMU if any of the registered RAPL Packages have the requirement + * of exposing its energy counters via Perf PMU. + * + * PMU Name: + * power + * + * Events: + * Name Event id RAPL Domain + * energy_cores 0x01 RAPL_DOMAIN_PP0 + * energy_pkg 0x02 RAPL_DOMAIN_PACKAGE + * energy_ram 0x03 RAPL_DOMAIN_DRAM + * energy_gpu 0x04 RAPL_DOMAIN_PP1 + * energy_psys 0x05 RAPL_DOMAIN_PLATFORM + * + * Unit: + * Joules + * + * Scale: + * 2.3283064365386962890625e-10 + * The same RAPL domain in different RAPL Packages may have different + * energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as + * the fixed unit for all energy counters, and covert each hardware + * counter increase to N times of PMU event counter increases. + * + * This is fully compatible with the current MSR RAPL PMU. This means that + * userspace programs like turbostat can use the same code to handle RAPL Perf + * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running + * underlying on the platform. + * + * Note that RAPL Packages can be probed/removed dynamically, and the events + * supported by each TPMI RAPL device can be different. Thus the RAPL PMU + * support is done on demand, which means + * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for + * unsupported counters are not exposed. + * 2. PMU is unregistered and registered when a new RAPL Package is probed and + * supports new counters that are not supported by current PMU. + * 3. PMU is unregistered when all registered RAPL Packages don't need PMU. + */ + +struct rapl_pmu { + struct pmu pmu; /* Perf PMU structure */ + u64 timer_ms; /* Maximum expiration time to avoid counter overflow */ + unsigned long domain_map; /* Events supported by current registered PMU */ + bool registered; /* Whether the PMU has been registered or not */ +}; + +static struct rapl_pmu rapl_pmu; + +/* PMU helpers */ + +static int get_pmu_cpu(struct rapl_package *rp) +{ + int cpu; + + if (!rp->has_pmu) + return nr_cpu_ids; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return nr_cpu_ids; + + /* TPMI RAPL uses any CPU in the package for PMU */ + for_each_online_cpu(cpu) + if (topology_physical_package_id(cpu) == rp->id) + return cpu; + + return nr_cpu_ids; +} + +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) +{ + if (!rp->has_pmu) + return false; + + /* Only TPMI RAPL is supported for now */ + if (rp->priv->type != RAPL_IF_TPMI) + return false; + + /* TPMI RAPL uses any CPU in the package for PMU */ + return topology_physical_package_id(cpu) == rp->id; +} + +static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + + return &rp->pmu_data; +} + +/* PMU event callbacks */ + +static u64 event_read_counter(struct perf_event *event) +{ + struct rapl_package *rp = event->pmu_private; + u64 val; + int ret; + + /* Return 0 for unsupported events */ + if (event->hw.idx < 0) + return 0; + + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + + /* Return 0 for failed read */ + if (ret) + return 0; + + return val; +} + +static void __rapl_pmu_event_start(struct perf_event *event) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &data->active_list); + + local64_set(&event->hw.prev_count, event_read_counter(event)); + if (++data->n_active == 1) + hrtimer_start(&data->hrtimer, data->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + __rapl_pmu_event_start(event); + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + + /* + * Follow the generic code to drain hwc->prev_count. + * The loop is not expected to run for multiple times. + */ + prev_raw_count = local64_read(&hwc->prev_count); + do { + new_raw_count = event_read_counter(event); + } while (!local64_try_cmpxchg(&hwc->prev_count, + &prev_raw_count, new_raw_count)); + + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + */ + delta = new_raw_count - prev_raw_count; + + /* + * Scale delta to smallest unit (2^-32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + sdelta = delta * data->scale[event->hw.flags]; + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + /* Mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(data->n_active <= 0); + if (--data->n_active == 0) + hrtimer_cancel(&data->hrtimer); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* Check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + raw_spin_unlock_irqrestore(&data->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_package_pmu_data *data = event_to_pmu_data(event); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + raw_spin_lock_irqsave(&data->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +/* RAPL PMU event ids, same as shown in sysfs */ +enum perf_rapl_events { + PERF_RAPL_PP0 = 1, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + PERF_RAPL_MAX +}; +#define RAPL_EVENT_MASK GENMASK(7, 0) + +static const int event_to_domain[PERF_RAPL_MAX] = { + [PERF_RAPL_PP0] = RAPL_DOMAIN_PP0, + [PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE, + [PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM, + [PERF_RAPL_PP1] = RAPL_DOMAIN_PP1, + [PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM, +}; + +static int rapl_pmu_event_init(struct perf_event *event) +{ + struct rapl_package *pos, *rp = NULL; + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int domain, idx; + + /* Only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Check for supported events only */ + if (!cfg || cfg >= PERF_RAPL_MAX) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + /* Find out which Package the event belongs to */ + list_for_each_entry(pos, &rapl_packages, plist) { + if (is_rp_pmu_cpu(pos, event->cpu)) { + rp = pos; + break; + } + } + if (!rp) + return -ENODEV; + + /* Find out which RAPL Domain the event belongs to */ + domain = event_to_domain[cfg]; + + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->pmu_private = rp; /* Which package */ + event->hw.flags = domain; /* Which domain */ + + event->hw.idx = -1; + /* Find out the index in rp->domains[] to get domain pointer */ + for (idx = 0; idx < rp->nr_domains; idx++) { + if (rp->domains[idx].id == domain) { + event->hw.idx = idx; + break; + } + } + + return 0; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_package_pmu_data *data = + container_of(hrtimer, struct rapl_package_pmu_data, hrtimer); + struct perf_event *event; + unsigned long flags; + + if (!data->n_active) + return HRTIMER_NORESTART; + + raw_spin_lock_irqsave(&data->lock, flags); + + list_for_each_entry(event, &data->active_list, active_entry) + rapl_event_update(event); + + raw_spin_unlock_irqrestore(&data->lock, flags); + + hrtimer_forward_now(hrtimer, data->timer_interval); + + return HRTIMER_RESTART; +} + +/* PMU sysfs attributes */ + +/* + * There are no default events, but we need to create "events" group (with + * empty attrs) before updating it with detected events. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rapl_package *rp; + cpumask_var_t cpu_mask; + int cpu; + int ret; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + cpus_read_lock(); + + cpumask_clear(cpu_mask); + + /* Choose a cpu for each RAPL Package */ + list_for_each_entry(rp, &rapl_packages, plist) { + cpu = get_pmu_cpu(rp); + if (cpu < nr_cpu_ids) + cpumask_set_cpu(cpu, cpu_mask); + } + cpus_read_unlock(); + + ret = cpumap_print_to_pagebuf(true, buf, cpu_mask); + + free_cpumask_var(cpu_mask); + + return ret; +} + +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static struct attribute_group pmu_cpumask_group = { + .attrs = pmu_cpumask_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *pmu_format_attr[] = { + &format_attr_event.attr, + NULL +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = pmu_format_attr, +}; + +static const struct attribute_group *pmu_attr_groups[] = { + &pmu_events_group, + &pmu_cpumask_group, + &pmu_format_group, + NULL +}; + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .event_str = str, \ +} + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules"); + +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10"); + +#define RAPL_EVENT_GROUP(_name, domain) \ +static struct attribute *pmu_attr_##_name[] = { \ + &event_attr_rapl_##_name.attr.attr, \ + &event_attr_rapl_unit_##_name.attr.attr, \ + &event_attr_rapl_scale_##_name.attr.attr, \ + NULL \ +}; \ +static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event) \ +{ \ + return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0; \ +} \ +static struct attribute_group pmu_group_##_name = { \ + .name = "events", \ + .attrs = pmu_attr_##_name, \ + .is_visible = is_visible_##_name, \ +} + +RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0); +RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE); +RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM); +RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1); +RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM); + +static const struct attribute_group *pmu_attr_update[] = { + &pmu_group_cores, + &pmu_group_pkg, + &pmu_group_ram, + &pmu_group_gpu, + &pmu_group_psys, + NULL +}; + +static int rapl_pmu_update(struct rapl_package *rp) +{ + int ret = 0; + + /* Return if PMU already covers all events supported by current RAPL Package */ + if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map))) + goto end; + + /* Unregister previous registered PMU */ + if (rapl_pmu.registered) + perf_pmu_unregister(&rapl_pmu.pmu); + + rapl_pmu.registered = false; + rapl_pmu.domain_map |= rp->domain_map; + + memset(&rapl_pmu.pmu, 0, sizeof(struct pmu)); + rapl_pmu.pmu.attr_groups = pmu_attr_groups; + rapl_pmu.pmu.attr_update = pmu_attr_update; + rapl_pmu.pmu.task_ctx_nr = perf_invalid_context; + rapl_pmu.pmu.event_init = rapl_pmu_event_init; + rapl_pmu.pmu.add = rapl_pmu_event_add; + rapl_pmu.pmu.del = rapl_pmu_event_del; + rapl_pmu.pmu.start = rapl_pmu_event_start; + rapl_pmu.pmu.stop = rapl_pmu_event_stop; + rapl_pmu.pmu.read = rapl_pmu_event_read; + rapl_pmu.pmu.module = THIS_MODULE; + rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT; + ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1); + if (ret) { + pr_info("Failed to register PMU\n"); + return ret; + } + + rapl_pmu.registered = true; +end: + rp->has_pmu = true; + return ret; +} + +int rapl_package_add_pmu(struct rapl_package *rp) +{ + struct rapl_package_pmu_data *data = &rp->pmu_data; + int idx; + + if (rp->has_pmu) + return -EEXIST; + + guard(cpus_read_lock)(); + + for (idx = 0; idx < rp->nr_domains; idx++) { + struct rapl_domain *rd = &rp->domains[idx]; + int domain = rd->id; + u64 val; + + if (!test_bit(domain, &rp->domain_map)) + continue; + + /* + * The RAPL PMU granularity is 2^-32 Joules + * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase + */ + val = rd->energy_unit * (1ULL << 32); + do_div(val, ENERGY_UNIT_SCALE * 1000000); + data->scale[domain] = val; + + if (!rapl_pmu.timer_ms) { + struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER); + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. + * + * max_count = rpi->mask >> rpi->shift + 1 + * max_energy_pj = max_count * rd->energy_unit + * max_time_sec = (max_energy_pj / 1000000000) / 200w + * + * rapl_pmu.timer_ms = max_time_sec * 1000 / 2 + */ + val = (rpi->mask >> rpi->shift) + 1; + val *= rd->energy_unit; + do_div(val, 1000000 * 200 * 2); + rapl_pmu.timer_ms = val; + + pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms); + } + + pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]); + } + + /* Initialize per package PMU data */ + raw_spin_lock_init(&data->lock); + INIT_LIST_HEAD(&data->active_list); + data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms); + hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + data->hrtimer.function = rapl_hrtimer_handle; + + return rapl_pmu_update(rp); +} +EXPORT_SYMBOL_GPL(rapl_package_add_pmu); + +void rapl_package_remove_pmu(struct rapl_package *rp) +{ + struct rapl_package *pos; + + if (!rp->has_pmu) + return; + + guard(cpus_read_lock)(); + + list_for_each_entry(pos, &rapl_packages, plist) { + /* PMU is still needed */ + if (pos->has_pmu && pos != rp) + return; + } + + perf_pmu_unregister(&rapl_pmu.pmu); + memset(&rapl_pmu, 0, sizeof(struct rapl_pmu)); +} +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu); +#endif + /* called from CPU hotplug notifier, hotplug lock held */ void rapl_remove_package_cpuslocked(struct rapl_package *rp) { diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index f3196f82fd8a..c0397423d3a8 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -158,6 +158,26 @@ struct rapl_if_priv { void *rpi; }; +#ifdef CONFIG_PERF_EVENTS +/** + * struct rapl_package_pmu_data: Per package data for PMU support + * @scale: Scale of 2^-32 Joules for each energy counter increase. + * @lock: Lock to protect n_active and active_list. + * @n_active: Number of active events. + * @active_list: List of active events. + * @timer_interval: Maximum timer expiration time before counter overflow. + * @hrtimer: Periodically update the counter to prevent overflow. + */ +struct rapl_package_pmu_data { + u64 scale[RAPL_DOMAIN_MAX]; + raw_spinlock_t lock; + int n_active; + struct list_head active_list; + ktime_t timer_interval; + struct hrtimer hrtimer; +}; +#endif + /* maximum rapl package domain name: package-%d-die-%d */ #define PACKAGE_DOMAIN_NAME_LENGTH 30 @@ -176,6 +196,10 @@ struct rapl_package { struct cpumask cpumask; char name[PACKAGE_DOMAIN_NAME_LENGTH]; struct rapl_if_priv *priv; +#ifdef CONFIG_PERF_EVENTS + bool has_pmu; + struct rapl_package_pmu_data pmu_data; +#endif }; struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv, @@ -188,4 +212,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +#ifdef CONFIG_PERF_EVENTS +int rapl_package_add_pmu(struct rapl_package *rp); +void rapl_package_remove_pmu(struct rapl_package *rp); +#else +static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; } +static inline void rapl_package_remove_pmu(struct rapl_package *rp) { } +#endif + #endif /* __INTEL_RAPL_H__ */ -- cgit