diff options
Diffstat (limited to 'drivers/cpufreq/intel_pstate.c')
-rw-r--r-- | drivers/cpufreq/intel_pstate.c | 254 |
1 files changed, 243 insertions, 11 deletions
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 17e566afbb41..ece120da3353 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -221,6 +221,11 @@ struct global_params { * preference/bias * @epp_saved: Saved EPP/EPB during system suspend or CPU offline * operation + * @hwp_req_cached: Cached value of the last HWP Request MSR + * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR + * @last_io_update: Last time when IO wake flag was set + * @sched_flags: Store scheduler flags for possible cross CPU update + * @hwp_boost_min: Last HWP boosted min performance * * This structure stores per CPU instance data for all CPUs. */ @@ -253,6 +258,11 @@ struct cpudata { s16 epp_policy; s16 epp_default; s16 epp_saved; + u64 hwp_req_cached; + u64 hwp_cap_cached; + u64 last_io_update; + unsigned int sched_flags; + u32 hwp_boost_min; }; static struct cpudata **all_cpu_data; @@ -284,7 +294,9 @@ struct pstate_funcs { static struct pstate_funcs pstate_funcs __read_mostly; static int hwp_active __read_mostly; +static int hwp_mode_bdw __read_mostly; static bool per_cpu_limits __read_mostly; +static bool hwp_boost __read_mostly; static struct cpufreq_driver *intel_pstate_driver __read_mostly; @@ -689,6 +701,7 @@ static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max, u64 cap; rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); + WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap); if (global.no_turbo) *current_max = HWP_GUARANTEED_PERF(cap); else @@ -763,6 +776,7 @@ update_epp: intel_pstate_set_epb(cpu, epp); } skip_epp: + WRITE_ONCE(cpu_data->hwp_req_cached, value); wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } @@ -1020,6 +1034,30 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, return count; } +static ssize_t show_hwp_dynamic_boost(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", hwp_boost); +} + +static ssize_t store_hwp_dynamic_boost(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + + ret = kstrtouint(buf, 10, &input); + if (ret) + return ret; + + mutex_lock(&intel_pstate_driver_lock); + hwp_boost = !!input; + intel_pstate_update_policies(); + mutex_unlock(&intel_pstate_driver_lock); + + return count; +} + show_one(max_perf_pct, max_perf_pct); show_one(min_perf_pct, min_perf_pct); @@ -1029,6 +1067,7 @@ define_one_global_rw(max_perf_pct); define_one_global_rw(min_perf_pct); define_one_global_ro(turbo_pct); define_one_global_ro(num_pstates); +define_one_global_rw(hwp_dynamic_boost); static struct attribute *intel_pstate_attributes[] = { &status.attr, @@ -1069,6 +1108,11 @@ static void __init intel_pstate_sysfs_expose_params(void) rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); WARN_ON(rc); + if (hwp_active) { + rc = sysfs_create_file(intel_pstate_kobject, + &hwp_dynamic_boost.attr); + WARN_ON(rc); + } } /************************** sysfs end ************************/ @@ -1370,7 +1414,15 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); cpu->pstate.scaling = pstate_funcs.get_scaling(); cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; - cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + + if (hwp_active && !hwp_mode_bdw) { + unsigned int phy_max, current_max; + + intel_pstate_get_hwp_max(cpu->cpu, &phy_max, ¤t_max); + cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling; + } else { + cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + } if (pstate_funcs.get_aperf_mperf_shift) cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift(); @@ -1381,6 +1433,116 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) intel_pstate_set_min_pstate(cpu); } +/* + * Long hold time will keep high perf limits for long time, + * which negatively impacts perf/watt for some workloads, + * like specpower. 3ms is based on experiements on some + * workoads. + */ +static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC; + +static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu) +{ + u64 hwp_req = READ_ONCE(cpu->hwp_req_cached); + u32 max_limit = (hwp_req & 0xff00) >> 8; + u32 min_limit = (hwp_req & 0xff); + u32 boost_level1; + + /* + * Cases to consider (User changes via sysfs or boot time): + * If, P0 (Turbo max) = P1 (Guaranteed max) = min: + * No boost, return. + * If, P0 (Turbo max) > P1 (Guaranteed max) = min: + * Should result in one level boost only for P0. + * If, P0 (Turbo max) = P1 (Guaranteed max) > min: + * Should result in two level boost: + * (min + p1)/2 and P1. + * If, P0 (Turbo max) > P1 (Guaranteed max) > min: + * Should result in three level boost: + * (min + p1)/2, P1 and P0. + */ + + /* If max and min are equal or already at max, nothing to boost */ + if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit) + return; + + if (!cpu->hwp_boost_min) + cpu->hwp_boost_min = min_limit; + + /* level at half way mark between min and guranteed */ + boost_level1 = (HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) + min_limit) >> 1; + + if (cpu->hwp_boost_min < boost_level1) + cpu->hwp_boost_min = boost_level1; + else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) + cpu->hwp_boost_min = HWP_GUARANTEED_PERF(cpu->hwp_cap_cached); + else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) && + max_limit != HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) + cpu->hwp_boost_min = max_limit; + else + return; + + hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min; + wrmsrl(MSR_HWP_REQUEST, hwp_req); + cpu->last_update = cpu->sample.time; +} + +static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu) +{ + if (cpu->hwp_boost_min) { + bool expired; + + /* Check if we are idle for hold time to boost down */ + expired = time_after64(cpu->sample.time, cpu->last_update + + hwp_boost_hold_time_ns); + if (expired) { + wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached); + cpu->hwp_boost_min = 0; + } + } + cpu->last_update = cpu->sample.time; +} + +static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu, + u64 time) +{ + cpu->sample.time = time; + + if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) { + bool do_io = false; + + cpu->sched_flags = 0; + /* + * Set iowait_boost flag and update time. Since IO WAIT flag + * is set all the time, we can't just conclude that there is + * some IO bound activity is scheduled on this CPU with just + * one occurrence. If we receive at least two in two + * consecutive ticks, then we treat as boost candidate. + */ + if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) + do_io = true; + + cpu->last_io_update = time; + + if (do_io) + intel_pstate_hwp_boost_up(cpu); + + } else { + intel_pstate_hwp_boost_down(cpu); + } +} + +static inline void intel_pstate_update_util_hwp(struct update_util_data *data, + u64 time, unsigned int flags) +{ + struct cpudata *cpu = container_of(data, struct cpudata, update_util); + + cpu->sched_flags |= flags; + + if (smp_processor_id() == cpu->cpu) + intel_pstate_update_util_hwp_local(cpu, time); +} + static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) { struct sample *sample = &cpu->sample; @@ -1641,6 +1803,12 @@ static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { {} }; +static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = { + ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs), + ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_funcs), + {} +}; + static int intel_pstate_init_cpu(unsigned int cpunum) { struct cpudata *cpu; @@ -1671,6 +1839,10 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_disable_ee(cpunum); intel_pstate_hwp_enable(cpu); + + id = x86_match_cpu(intel_pstate_hwp_boost_ids); + if (id) + hwp_boost = true; } intel_pstate_get_cpu_pstates(cpu); @@ -1684,7 +1856,7 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { struct cpudata *cpu = all_cpu_data[cpu_num]; - if (hwp_active) + if (hwp_active && !hwp_boost) return; if (cpu->update_util_set) @@ -1693,7 +1865,9 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num) /* Prevent intel_pstate_update_util() from using stale data. */ cpu->sample.time = 0; cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, - intel_pstate_update_util); + (hwp_active ? + intel_pstate_update_util_hwp : + intel_pstate_update_util)); cpu->update_util_set = true; } @@ -1805,8 +1979,16 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_set_update_util_hook(policy->cpu); } - if (hwp_active) + if (hwp_active) { + /* + * When hwp_boost was active before and dynamically it + * was turned off, in that case we need to clear the + * update util hook. + */ + if (!hwp_boost) + intel_pstate_clear_update_util_hook(policy->cpu); intel_pstate_hwp_set(policy->cpu); + } mutex_unlock(&intel_pstate_limits_lock); @@ -1939,13 +2121,51 @@ static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) return 0; } +/* Use of trace in passive mode: + * + * In passive mode the trace core_busy field (also known as the + * performance field, and lablelled as such on the graphs; also known as + * core_avg_perf) is not needed and so is re-assigned to indicate if the + * driver call was via the normal or fast switch path. Various graphs + * output from the intel_pstate_tracer.py utility that include core_busy + * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%, + * so we use 10 to indicate the the normal path through the driver, and + * 90 to indicate the fast switch path through the driver. + * The scaled_busy field is not used, and is set to 0. + */ + +#define INTEL_PSTATE_TRACE_TARGET 10 +#define INTEL_PSTATE_TRACE_FAST_SWITCH 90 + +static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, int old_pstate) +{ + struct sample *sample; + + if (!trace_pstate_sample_enabled()) + return; + + if (!intel_pstate_sample(cpu, ktime_get())) + return; + + sample = &cpu->sample; + trace_pstate_sample(trace_type, + 0, + old_pstate, + cpu->pstate.current_pstate, + sample->mperf, + sample->aperf, + sample->tsc, + get_avg_frequency(cpu), + fp_toint(cpu->iowait_boost * 100)); +} + static int intel_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { struct cpudata *cpu = all_cpu_data[policy->cpu]; struct cpufreq_freqs freqs; - int target_pstate; + int target_pstate, old_pstate; update_turbo_state(); @@ -1965,12 +2185,14 @@ static int intel_cpufreq_target(struct cpufreq_policy *policy, break; } target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + old_pstate = cpu->pstate.current_pstate; if (target_pstate != cpu->pstate.current_pstate) { cpu->pstate.current_pstate = target_pstate; wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, target_pstate)); } freqs.new = target_pstate * cpu->pstate.scaling; + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_TARGET, old_pstate); cpufreq_freq_transition_end(policy, &freqs, false); return 0; @@ -1980,13 +2202,15 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { struct cpudata *cpu = all_cpu_data[policy->cpu]; - int target_pstate; + int target_pstate, old_pstate; update_turbo_state(); target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + old_pstate = cpu->pstate.current_pstate; intel_pstate_update_pstate(cpu, target_pstate); + intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); return target_pstate * cpu->pstate.scaling; } @@ -2252,28 +2476,36 @@ static inline bool intel_pstate_has_acpi_ppc(void) { return false; } static inline void intel_pstate_request_control_from_smm(void) {} #endif /* CONFIG_ACPI */ +#define INTEL_PSTATE_HWP_BROADWELL 0x01 + +#define ICPU_HWP(model, hwp_mode) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_HWP, hwp_mode } + static const struct x86_cpu_id hwp_support_ids[] __initconst = { - { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP }, + ICPU_HWP(INTEL_FAM6_BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), + ICPU_HWP(INTEL_FAM6_BROADWELL_XEON_D, INTEL_PSTATE_HWP_BROADWELL), + ICPU_HWP(X86_MODEL_ANY, 0), {} }; static int __init intel_pstate_init(void) { + const struct x86_cpu_id *id; int rc; if (no_load) return -ENODEV; - if (x86_match_cpu(hwp_support_ids)) { + id = x86_match_cpu(hwp_support_ids); + if (id) { copy_cpu_funcs(&core_funcs); if (!no_hwp) { hwp_active++; + hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; goto hwp_cpu_matched; } } else { - const struct x86_cpu_id *id; - id = x86_match_cpu(intel_pstate_cpu_ids); if (!id) return -ENODEV; @@ -2297,7 +2529,7 @@ hwp_cpu_matched: pr_info("Intel P-state driver initializing\n"); - all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus()); + all_cpu_data = vzalloc(array_size(sizeof(void *), num_possible_cpus())); if (!all_cpu_data) return -ENOMEM; |