Diffstat (limited to 'drivers/cpufreq/intel_pstate.c')
 drivers/cpufreq/intel_pstate.c | 254
 1 file changed, 243 insertions(+), 11 deletions(-)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 17e566afbb41..ece120da3353 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -221,6 +221,11 @@ struct global_params {
* preference/bias
* @epp_saved: Saved EPP/EPB during system suspend or CPU offline
* operation
+ * @hwp_req_cached: Cached value of the last HWP Request MSR
+ * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR
+ * @last_io_update: Last time when IO wake flag was set
+ * @sched_flags: Store scheduler flags for possible cross CPU update
+ * @hwp_boost_min: Last HWP boosted min performance
*
* This structure stores per CPU instance data for all CPUs.
*/
@@ -253,6 +258,11 @@ struct cpudata {
s16 epp_policy;
s16 epp_default;
s16 epp_saved;
+ u64 hwp_req_cached;
+ u64 hwp_cap_cached;
+ u64 last_io_update;
+ unsigned int sched_flags;
+ u32 hwp_boost_min;
};
static struct cpudata **all_cpu_data;
@@ -284,7 +294,9 @@ struct pstate_funcs {
static struct pstate_funcs pstate_funcs __read_mostly;
static int hwp_active __read_mostly;
+static int hwp_mode_bdw __read_mostly;
static bool per_cpu_limits __read_mostly;
+static bool hwp_boost __read_mostly;
static struct cpufreq_driver *intel_pstate_driver __read_mostly;
@@ -689,6 +701,7 @@ static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max,
u64 cap;
rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
+ WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap);
if (global.no_turbo)
*current_max = HWP_GUARANTEED_PERF(cap);
else
@@ -763,6 +776,7 @@ update_epp:
intel_pstate_set_epb(cpu, epp);
}
skip_epp:
+ WRITE_ONCE(cpu_data->hwp_req_cached, value);
wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
}
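The WRITE_ONCE() above pairs with a lockless READ_ONCE() of hwp_req_cached in the scheduler-driven boost path added later in this patch. A minimal sketch of the pattern, assuming the field is only accessed through helpers like these (hypothetical names, not part of the patch):

/* Sketch only: tear-free publication of a u64 that a remote reader
 * may load without taking a lock.
 */
static void hwp_req_cache_store(struct cpudata *cpu, u64 value)
{
	WRITE_ONCE(cpu->hwp_req_cached, value);	/* single, untorn store */
}

static u64 hwp_req_cache_load(struct cpudata *cpu)
{
	return READ_ONCE(cpu->hwp_req_cached);	/* single, untorn load */
}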
@@ -1020,6 +1034,30 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
return count;
}
+static ssize_t show_hwp_dynamic_boost(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", hwp_boost);
+}
+
+static ssize_t store_hwp_dynamic_boost(struct kobject *a, struct attribute *b,
+ const char *buf, size_t count)
+{
+ unsigned int input;
+ int ret;
+
+ ret = kstrtouint(buf, 10, &input);
+ if (ret)
+ return ret;
+
+ mutex_lock(&intel_pstate_driver_lock);
+ hwp_boost = !!input;
+ intel_pstate_update_policies();
+ mutex_unlock(&intel_pstate_driver_lock);
+
+ return count;
+}
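For illustration, the new attribute can be toggled from user space; a minimal sketch, assuming the knob appears at the usual intel_pstate sysfs location:

#include <stdio.h>

int main(void)
{
	/* Any non-zero value enables dynamic boost; 0 disables it. */
	FILE *f = fopen("/sys/devices/system/cpu/intel_pstate/hwp_dynamic_boost", "w");

	if (!f)
		return 1;
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}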
+
show_one(max_perf_pct, max_perf_pct);
show_one(min_perf_pct, min_perf_pct);
@@ -1029,6 +1067,7 @@ define_one_global_rw(max_perf_pct);
define_one_global_rw(min_perf_pct);
define_one_global_ro(turbo_pct);
define_one_global_ro(num_pstates);
+define_one_global_rw(hwp_dynamic_boost);
static struct attribute *intel_pstate_attributes[] = {
&status.attr,
@@ -1069,6 +1108,11 @@ static void __init intel_pstate_sysfs_expose_params(void)
rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
WARN_ON(rc);
+ if (hwp_active) {
+ rc = sysfs_create_file(intel_pstate_kobject,
+ &hwp_dynamic_boost.attr);
+ WARN_ON(rc);
+ }
}
/************************** sysfs end ************************/
@@ -1370,7 +1414,15 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
cpu->pstate.scaling = pstate_funcs.get_scaling();
cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
- cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+
+ if (hwp_active && !hwp_mode_bdw) {
+ unsigned int phy_max, current_max;
+
+ intel_pstate_get_hwp_max(cpu->cpu, &phy_max, &current_max);
+ cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling;
+ } else {
+ cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
+ }
if (pstate_funcs.get_aperf_mperf_shift)
cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();
@@ -1381,6 +1433,116 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
intel_pstate_set_min_pstate(cpu);
}
+/*
+ * A long hold time keeps the high performance limits in place for a
+ * long time, which negatively impacts perf/watt for some workloads,
+ * like SPECpower. 3ms is based on experiments with some workloads.
+ */
+static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC;
+
+static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu)
+{
+ u64 hwp_req = READ_ONCE(cpu->hwp_req_cached);
+ u32 max_limit = (hwp_req & 0xff00) >> 8;
+ u32 min_limit = (hwp_req & 0xff);
+ u32 boost_level1;
+
+ /*
+ * Cases to consider (User changes via sysfs or boot time):
+ * If, P0 (Turbo max) = P1 (Guaranteed max) = min:
+ * No boost, return.
+ * If, P0 (Turbo max) > P1 (Guaranteed max) = min:
+ * Should result in one level boost only for P0.
+ * If, P0 (Turbo max) = P1 (Guaranteed max) > min:
+ * Should result in two level boost:
+ * (min + p1)/2 and P1.
+ * If, P0 (Turbo max) > P1 (Guaranteed max) > min:
+ * Should result in three level boost:
+ * (min + p1)/2, P1 and P0.
+ */
+
+ /* If max and min are equal or already at max, nothing to boost */
+ if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit)
+ return;
+
+ if (!cpu->hwp_boost_min)
+ cpu->hwp_boost_min = min_limit;
+
+ /* Boost level at the halfway mark between min and guaranteed */
+ boost_level1 = (HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) + min_limit) >> 1;
+
+ if (cpu->hwp_boost_min < boost_level1)
+ cpu->hwp_boost_min = boost_level1;
+ else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(cpu->hwp_cap_cached))
+ cpu->hwp_boost_min = HWP_GUARANTEED_PERF(cpu->hwp_cap_cached);
+ else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) &&
+ max_limit != HWP_GUARANTEED_PERF(cpu->hwp_cap_cached))
+ cpu->hwp_boost_min = max_limit;
+ else
+ return;
+
+ hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min;
+ wrmsrl(MSR_HWP_REQUEST, hwp_req);
+ cpu->last_update = cpu->sample.time;
+}
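To make the ladder concrete: with hypothetical limits min = 10, guaranteed P1 = 20 and turbo P0 = 25, successive boosts raise hwp_boost_min to 15, then 20, then 25. A stand-alone sketch of the same stepping (illustrative helper; the guard for max == min is omitted here because the caller handles it):

static u32 next_boost_min(u32 cur, u32 min, u32 p1, u32 max)
{
	u32 level1 = (p1 + min) >> 1;	/* halfway between min and P1 */

	if (!cur)
		cur = min;
	if (cur < level1)
		return level1;
	if (cur < p1)
		return p1;
	if (cur == p1 && max != p1)
		return max;
	return cur;			/* already boosted to the top */
}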
+
+static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu)
+{
+ if (cpu->hwp_boost_min) {
+ bool expired;
+
+ /* Check if we are idle for hold time to boost down */
+ expired = time_after64(cpu->sample.time, cpu->last_update +
+ hwp_boost_hold_time_ns);
+ if (expired) {
+ wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached);
+ cpu->hwp_boost_min = 0;
+ }
+ }
+ cpu->last_update = cpu->sample.time;
+}
+
+static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu,
+ u64 time)
+{
+ cpu->sample.time = time;
+
+ if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
+ bool do_io = false;
+
+ cpu->sched_flags = 0;
+ /*
+ * Set iowait_boost flag and update time. Since the IO WAIT flag
+ * is set all the time, we can't conclude from a single occurrence
+ * that IO-bound activity is scheduled on this CPU. If we receive
+ * at least two events within two consecutive ticks, we treat this
+ * CPU as a boost candidate.
+ */
+ if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
+ do_io = true;
+
+ cpu->last_io_update = time;
+
+ if (do_io)
+ intel_pstate_hwp_boost_up(cpu);
+
+ } else {
+ intel_pstate_hwp_boost_down(cpu);
+ }
+}
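The two-consecutive-tick filter above can also be read in isolation; a minimal sketch (hypothetical helper name, not part of the patch):

/* Sketch only: a single SCHED_CPUFREQ_IOWAIT event is ignored; a
 * second event within two scheduler ticks marks a boost candidate.
 */
static bool io_boost_candidate(struct cpudata *cpu, u64 time)
{
	bool do_io = time_before64(time, cpu->last_io_update + 2 * TICK_NSEC);

	cpu->last_io_update = time;
	return do_io;
}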
+
+static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
+ u64 time, unsigned int flags)
+{
+ struct cpudata *cpu = container_of(data, struct cpudata, update_util);
+
+ cpu->sched_flags |= flags;
+
+ if (smp_processor_id() == cpu->cpu)
+ intel_pstate_update_util_hwp_local(cpu, time);
+}
+
static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu)
{
struct sample *sample = &cpu->sample;
@@ -1641,6 +1803,12 @@ static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
{}
};
+static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = {
+ ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs),
+ ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_funcs),
+ {}
+};
+
static int intel_pstate_init_cpu(unsigned int cpunum)
{
struct cpudata *cpu;
@@ -1671,6 +1839,10 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
intel_pstate_disable_ee(cpunum);
intel_pstate_hwp_enable(cpu);
+
+ id = x86_match_cpu(intel_pstate_hwp_boost_ids);
+ if (id)
+ hwp_boost = true;
}
intel_pstate_get_cpu_pstates(cpu);
@@ -1684,7 +1856,7 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
{
struct cpudata *cpu = all_cpu_data[cpu_num];
- if (hwp_active)
+ if (hwp_active && !hwp_boost)
return;
if (cpu->update_util_set)
@@ -1693,7 +1865,9 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
/* Prevent intel_pstate_update_util() from using stale data. */
cpu->sample.time = 0;
cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
- intel_pstate_update_util);
+ (hwp_active ?
+ intel_pstate_update_util_hwp :
+ intel_pstate_update_util));
cpu->update_util_set = true;
}
@@ -1805,8 +1979,16 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
intel_pstate_set_update_util_hook(policy->cpu);
}
- if (hwp_active)
+ if (hwp_active) {
+ /*
+ * If hwp_boost was active before and has dynamically been
+ * turned off, we need to clear the update util hook.
+ */
+ if (!hwp_boost)
+ intel_pstate_clear_update_util_hook(policy->cpu);
intel_pstate_hwp_set(policy->cpu);
+ }
mutex_unlock(&intel_pstate_limits_lock);
@@ -1939,13 +2121,51 @@ static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
return 0;
}
+/* Use of trace in passive mode:
+ *
+ * In passive mode the trace core_busy field (also known as the
+ * performance field, and lablelled as such on the graphs; also known as
+ * core_avg_perf) is not needed and so is re-assigned to indicate if the
+ * driver call was via the normal or fast switch path. Various graphs
+ * output from the intel_pstate_tracer.py utility that include core_busy
+ * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%,
+ * so we use 10 to indicate the normal path through the driver, and
+ * 90 to indicate the fast switch path through the driver.
+ * The scaled_busy field is not used, and is set to 0.
+ */
+
+#define INTEL_PSTATE_TRACE_TARGET 10
+#define INTEL_PSTATE_TRACE_FAST_SWITCH 90
+
+static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate)
+{
+ struct sample *sample;
+
+ if (!trace_pstate_sample_enabled())
+ return;
+
+ if (!intel_pstate_sample(cpu, ktime_get()))
+ return;
+
+ sample = &cpu->sample;
+ trace_pstate_sample(trace_type,
+ 0,
+ old_pstate,
+ cpu->pstate.current_pstate,
+ sample->mperf,
+ sample->aperf,
+ sample->tsc,
+ get_avg_frequency(cpu),
+ fp_toint(cpu->iowait_boost * 100));
+}
+
static int intel_cpufreq_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
struct cpudata *cpu = all_cpu_data[policy->cpu];
struct cpufreq_freqs freqs;
- int target_pstate;
+ int target_pstate, old_pstate;
update_turbo_state();
@@ -1965,12 +2185,14 @@ static int intel_cpufreq_target(struct cpufreq_policy *policy,
break;
}
target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
+ old_pstate = cpu->pstate.current_pstate;
if (target_pstate != cpu->pstate.current_pstate) {
cpu->pstate.current_pstate = target_pstate;
wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
pstate_funcs.get_val(cpu, target_pstate));
}
freqs.new = target_pstate * cpu->pstate.scaling;
+ intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate);
cpufreq_freq_transition_end(policy, &freqs, false);
return 0;
@@ -1980,13 +2202,15 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
unsigned int target_freq)
{
struct cpudata *cpu = all_cpu_data[policy->cpu];
- int target_pstate;
+ int target_pstate, old_pstate;
update_turbo_state();
target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
+ old_pstate = cpu->pstate.current_pstate;
intel_pstate_update_pstate(cpu, target_pstate);
+ intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
return target_pstate * cpu->pstate.scaling;
}
@@ -2252,28 +2476,36 @@ static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
static inline void intel_pstate_request_control_from_smm(void) {}
#endif /* CONFIG_ACPI */
+#define INTEL_PSTATE_HWP_BROADWELL 0x01
+
+#define ICPU_HWP(model, hwp_mode) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_HWP, hwp_mode }
+
static const struct x86_cpu_id hwp_support_ids[] __initconst = {
- { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
+ ICPU_HWP(INTEL_FAM6_BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL),
+ ICPU_HWP(INTEL_FAM6_BROADWELL_XEON_D, INTEL_PSTATE_HWP_BROADWELL),
+ ICPU_HWP(X86_MODEL_ANY, 0),
{}
};
static int __init intel_pstate_init(void)
{
+ const struct x86_cpu_id *id;
int rc;
if (no_load)
return -ENODEV;
- if (x86_match_cpu(hwp_support_ids)) {
+ id = x86_match_cpu(hwp_support_ids);
+ if (id) {
copy_cpu_funcs(&core_funcs);
if (!no_hwp) {
hwp_active++;
+ hwp_mode_bdw = id->driver_data;
intel_pstate.attr = hwp_cpufreq_attrs;
goto hwp_cpu_matched;
}
} else {
- const struct x86_cpu_id *id;
-
id = x86_match_cpu(intel_pstate_cpu_ids);
if (!id)
return -ENODEV;
@@ -2297,7 +2529,7 @@ hwp_cpu_matched:
pr_info("Intel P-state driver initializing\n");
- all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
+ all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus()));
if (!all_cpu_data)
return -ENOMEM;
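For context, array_size() (from <linux/overflow.h>) saturates to SIZE_MAX when the multiplication would overflow, so vzalloc() then fails instead of returning an undersized buffer. An equivalent open-coded check, for illustration only:

	size_t bytes;

	if (check_mul_overflow(sizeof(void *), (size_t)num_possible_cpus(), &bytes))
		bytes = SIZE_MAX;	/* forces vzalloc() to fail cleanly */
	all_cpu_data = vzalloc(bytes);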