diff options
-rw-r--r-- | Documentation/thermal/sysfs-api.txt | 21 | ||||
-rw-r--r-- | drivers/thermal/Kconfig | 17 | ||||
-rw-r--r-- | drivers/thermal/devfreq_cooling.c | 152 | ||||
-rw-r--r-- | drivers/thermal/intel_soc_dts_thermal.c | 9 | ||||
-rw-r--r-- | drivers/thermal/thermal_core.c | 64 | ||||
-rw-r--r-- | include/linux/devfreq_cooling.h | 19 | ||||
-rw-r--r-- | include/trace/events/thermal.h | 11 |
7 files changed, 244 insertions, 49 deletions
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt index ef473dc7f55e..bb9a0a53e76b 100644 --- a/Documentation/thermal/sysfs-api.txt +++ b/Documentation/thermal/sysfs-api.txt @@ -582,3 +582,24 @@ platform data is provided, this uses the step_wise throttling policy. This function serves as an arbitrator to set the state of a cooling device. It sets the cooling device to the deepest cooling state if possible. + +6. thermal_emergency_poweroff: + +On an event of critical trip temperature crossing. Thermal framework +allows the system to shutdown gracefully by calling orderly_poweroff(). +In the event of a failure of orderly_poweroff() to shut down the system +we are in danger of keeping the system alive at undesirably high +temperatures. To mitigate this high risk scenario we program a work +queue to fire after a pre-determined number of seconds to start +an emergency shutdown of the device using the kernel_power_off() +function. In case kernel_power_off() fails then finally +emergency_restart() is called in the worst case. + +The delay should be carefully profiled so as to give adequate time for +orderly_poweroff(). In case of failure of an orderly_poweroff() the +emergency poweroff kicks in after the delay has elapsed and shuts down +the system. + +If set to 0 emergency poweroff will not be supported. So a carefully +profiled non-zero positive value is a must for emergerncy poweroff to be +triggered. diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index f786ae433032..4edc011fe4a5 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -15,6 +15,23 @@ menuconfig THERMAL if THERMAL +config THERMAL_EMERGENCY_POWEROFF_DELAY_MS + int "Emergency poweroff delay in milli-seconds" + depends on THERMAL + default 0 + help + Thermal subsystem will issue a graceful shutdown when + critical temperatures are reached using orderly_poweroff(). In + case of failure of an orderly_poweroff(), the thermal emergency + poweroff kicks in after a delay has elapsed and shuts down the system. + This config is number of milliseconds to delay before emergency + poweroff kicks in. Similarly to the critical trip point, + the delay should be carefully profiled so as to give adequate + time for orderly_poweroff() to finish on regular execution. + If set to 0 emergency poweroff will not be supported. + + In doubt, leave as 0. + config THERMAL_HWMON bool prompt "Expose thermal sensors as hwmon device" diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index 4bf4ad58cffd..ef59256887ff 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -28,6 +28,8 @@ #include <trace/events/thermal.h> +#define SCALE_ERROR_MITIGATION 100 + static DEFINE_IDA(devfreq_ida); /** @@ -45,6 +47,12 @@ static DEFINE_IDA(devfreq_ida); * @freq_table_size: Size of the @freq_table and @power_table * @power_ops: Pointer to devfreq_cooling_power, used to generate the * @power_table. + * @res_util: Resource utilization scaling factor for the power. + * It is multiplied by 100 to minimize the error. It is used + * for estimation of the power budget instead of using + * 'utilization' (which is 'busy_time / 'total_time'). + * The 'res_util' range is from 100 to (power_table[state] * 100) + * for the corresponding 'state'. */ struct devfreq_cooling_device { int id; @@ -55,6 +63,8 @@ struct devfreq_cooling_device { u32 *freq_table; size_t freq_table_size; struct devfreq_cooling_power *power_ops; + u32 res_util; + int capped_state; }; /** @@ -164,27 +174,12 @@ freq_get_state(struct devfreq_cooling_device *dfc, unsigned long freq) return THERMAL_CSTATE_INVALID; } -/** - * get_static_power() - calculate the static power - * @dfc: Pointer to devfreq cooling device - * @freq: Frequency in Hz - * - * Calculate the static power in milliwatts using the supplied - * get_static_power(). The current voltage is calculated using the - * OPP library. If no get_static_power() was supplied, assume the - * static power is negligible. - */ -static unsigned long -get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq) +static unsigned long get_voltage(struct devfreq *df, unsigned long freq) { - struct devfreq *df = dfc->devfreq; struct device *dev = df->dev.parent; unsigned long voltage; struct dev_pm_opp *opp; - if (!dfc->power_ops->get_static_power) - return 0; - opp = dev_pm_opp_find_freq_exact(dev, freq, true); if (PTR_ERR(opp) == -ERANGE) opp = dev_pm_opp_find_freq_exact(dev, freq, false); @@ -202,9 +197,35 @@ get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq) dev_err_ratelimited(dev, "Failed to get voltage for frequency %lu\n", freq); - return 0; } + return voltage; +} + +/** + * get_static_power() - calculate the static power + * @dfc: Pointer to devfreq cooling device + * @freq: Frequency in Hz + * + * Calculate the static power in milliwatts using the supplied + * get_static_power(). The current voltage is calculated using the + * OPP library. If no get_static_power() was supplied, assume the + * static power is negligible. + */ +static unsigned long +get_static_power(struct devfreq_cooling_device *dfc, unsigned long freq) +{ + struct devfreq *df = dfc->devfreq; + unsigned long voltage; + + if (!dfc->power_ops->get_static_power) + return 0; + + voltage = get_voltage(df, freq); + + if (voltage == 0) + return 0; + return dfc->power_ops->get_static_power(df, voltage); } @@ -239,6 +260,16 @@ get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq, return power; } + +static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc, + unsigned long freq, + unsigned long voltage) +{ + return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq, + voltage); +} + + static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 *power) @@ -248,27 +279,55 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd struct devfreq_dev_status *status = &df->last_status; unsigned long state; unsigned long freq = status->current_frequency; - u32 dyn_power, static_power; + unsigned long voltage; + u32 dyn_power = 0; + u32 static_power = 0; + int res; - /* Get dynamic power for state */ state = freq_get_state(dfc, freq); - if (state == THERMAL_CSTATE_INVALID) - return -EAGAIN; + if (state == THERMAL_CSTATE_INVALID) { + res = -EAGAIN; + goto fail; + } - dyn_power = dfc->power_table[state]; + if (dfc->power_ops->get_real_power) { + voltage = get_voltage(df, freq); + if (voltage == 0) { + res = -EINVAL; + goto fail; + } - /* Scale dynamic power for utilization */ - dyn_power = (dyn_power * status->busy_time) / status->total_time; + res = dfc->power_ops->get_real_power(df, power, freq, voltage); + if (!res) { + state = dfc->capped_state; + dfc->res_util = dfc->power_table[state]; + dfc->res_util *= SCALE_ERROR_MITIGATION; - /* Get static power */ - static_power = get_static_power(dfc, freq); + if (*power > 1) + dfc->res_util /= *power; + } else { + goto fail; + } + } else { + dyn_power = dfc->power_table[state]; - trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power, - static_power); + /* Scale dynamic power for utilization */ + dyn_power *= status->busy_time; + dyn_power /= status->total_time; + /* Get static power */ + static_power = get_static_power(dfc, freq); - *power = dyn_power + static_power; + *power = dyn_power + static_power; + } + + trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power, + static_power, *power); return 0; +fail: + /* It is safe to set max in this case */ + dfc->res_util = SCALE_ERROR_MITIGATION; + return res; } static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev, @@ -301,26 +360,34 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, unsigned long busy_time; s32 dyn_power; u32 static_power; + s32 est_power; int i; - static_power = get_static_power(dfc, freq); + if (dfc->power_ops->get_real_power) { + /* Scale for resource utilization */ + est_power = power * dfc->res_util; + est_power /= SCALE_ERROR_MITIGATION; + } else { + static_power = get_static_power(dfc, freq); - dyn_power = power - static_power; - dyn_power = dyn_power > 0 ? dyn_power : 0; + dyn_power = power - static_power; + dyn_power = dyn_power > 0 ? dyn_power : 0; - /* Scale dynamic power for utilization */ - busy_time = status->busy_time ?: 1; - dyn_power = (dyn_power * status->total_time) / busy_time; + /* Scale dynamic power for utilization */ + busy_time = status->busy_time ?: 1; + est_power = (dyn_power * status->total_time) / busy_time; + } /* * Find the first cooling state that is within the power * budget for dynamic power. */ for (i = 0; i < dfc->freq_table_size - 1; i++) - if (dyn_power >= dfc->power_table[i]) + if (est_power >= dfc->power_table[i]) break; *state = i; + dfc->capped_state = i; trace_thermal_power_devfreq_limit(cdev, freq, *state, power); return 0; } @@ -376,7 +443,7 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) } for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { - unsigned long power_dyn, voltage; + unsigned long power, voltage; struct dev_pm_opp *opp; opp = dev_pm_opp_find_freq_floor(dev, &freq); @@ -389,12 +456,15 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) dev_pm_opp_put(opp); if (dfc->power_ops) { - power_dyn = get_dynamic_power(dfc, freq, voltage); + if (dfc->power_ops->get_real_power) + power = get_total_power(dfc, freq, voltage); + else + power = get_dynamic_power(dfc, freq, voltage); - dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n", - freq / 1000000, voltage, power_dyn, power_dyn); + dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n", + freq / 1000000, voltage, power, power); - power_table[i] = power_dyn; + power_table[i] = power; } freq_table[i] = freq; diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c index b2bbaa1c60b0..c27868b2c6af 100644 --- a/drivers/thermal/intel_soc_dts_thermal.c +++ b/drivers/thermal/intel_soc_dts_thermal.c @@ -73,8 +73,12 @@ static int __init intel_soc_thermal_init(void) IRQF_TRIGGER_RISING | IRQF_ONESHOT, "soc_dts", soc_dts); if (err) { - pr_err("request_threaded_irq ret %d\n", err); - goto error_irq; + /* + * Do not just error out because the user space thermal + * daemon such as DPTF may use polling instead of being + * interrupt driven. + */ + pr_warn("request_threaded_irq ret %d\n", err); } } @@ -88,7 +92,6 @@ static int __init intel_soc_thermal_init(void) error_trips: if (soc_dts_thres_irq) free_irq(soc_dts_thres_irq, soc_dts); -error_irq: intel_soc_dts_iosf_exit(soc_dts); return err; diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 11f0675cb7e5..b21b9cc2c8d6 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -45,8 +45,10 @@ static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); +static DEFINE_MUTEX(poweroff_lock); static atomic_t in_suspend; +static bool power_off_triggered; static struct thermal_governor *def_governor; @@ -322,6 +324,54 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz, def_governor->throttle(tz, trip); } +/** + * thermal_emergency_poweroff_func - emergency poweroff work after a known delay + * @work: work_struct associated with the emergency poweroff function + * + * This function is called in very critical situations to force + * a kernel poweroff after a configurable timeout value. + */ +static void thermal_emergency_poweroff_func(struct work_struct *work) +{ + /* + * We have reached here after the emergency thermal shutdown + * Waiting period has expired. This means orderly_poweroff has + * not been able to shut off the system for some reason. + * Try to shut down the system immediately using kernel_power_off + * if populated + */ + WARN(1, "Attempting kernel_power_off: Temperature too high\n"); + kernel_power_off(); + + /* + * Worst of the worst case trigger emergency restart + */ + WARN(1, "Attempting emergency_restart: Temperature too high\n"); + emergency_restart(); +} + +static DECLARE_DELAYED_WORK(thermal_emergency_poweroff_work, + thermal_emergency_poweroff_func); + +/** + * thermal_emergency_poweroff - Trigger an emergency system poweroff + * + * This may be called from any critical situation to trigger a system shutdown + * after a known period of time. By default this is not scheduled. + */ +void thermal_emergency_poweroff(void) +{ + int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; + /* + * poweroff_delay_ms must be a carefully profiled positive value. + * Its a must for thermal_emergency_poweroff_work to be scheduled + */ + if (poweroff_delay_ms <= 0) + return; + schedule_delayed_work(&thermal_emergency_poweroff_work, + msecs_to_jiffies(poweroff_delay_ms)); +} + static void handle_critical_trips(struct thermal_zone_device *tz, int trip, enum thermal_trip_type trip_type) { @@ -342,7 +392,17 @@ static void handle_critical_trips(struct thermal_zone_device *tz, dev_emerg(&tz->device, "critical temperature reached(%d C),shutting down\n", tz->temperature / 1000); - orderly_poweroff(true); + mutex_lock(&poweroff_lock); + if (!power_off_triggered) { + /* + * Queue a backup emergency shutdown in the event of + * orderly_poweroff failure + */ + thermal_emergency_poweroff(); + orderly_poweroff(true); + power_off_triggered = true; + } + mutex_unlock(&poweroff_lock); } } @@ -1463,6 +1523,7 @@ static int __init thermal_init(void) { int result; + mutex_init(&poweroff_lock); result = thermal_register_governors(); if (result) goto error; @@ -1497,6 +1558,7 @@ error: ida_destroy(&thermal_cdev_ida); mutex_destroy(&thermal_list_lock); mutex_destroy(&thermal_governor_lock); + mutex_destroy(&poweroff_lock); return result; } diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h index c35d0c0e0ada..4635f95000a4 100644 --- a/include/linux/devfreq_cooling.h +++ b/include/linux/devfreq_cooling.h @@ -34,6 +34,23 @@ * If get_dynamic_power() is NULL, then the * dynamic power is calculated as * @dyn_power_coeff * frequency * voltage^2 + * @get_real_power: When this is set, the framework uses it to ask the + * device driver for the actual power. + * Some devices have more sophisticated methods + * (like power counters) to approximate the actual power + * that they use. + * This function provides more accurate data to the + * thermal governor. When the driver does not provide + * such function, framework just uses pre-calculated + * table and scale the power by 'utilization' + * (based on 'busy_time' and 'total_time' taken from + * devfreq 'last_status'). + * The value returned by this function must be lower + * or equal than the maximum power value + * for the current state + * (which can be found in power_table[state]). + * When this interface is used, the power_table holds + * max total (static + dynamic) power value for each OPP. */ struct devfreq_cooling_power { unsigned long (*get_static_power)(struct devfreq *devfreq, @@ -41,6 +58,8 @@ struct devfreq_cooling_power { unsigned long (*get_dynamic_power)(struct devfreq *devfreq, unsigned long freq, unsigned long voltage); + int (*get_real_power)(struct devfreq *df, u32 *power, + unsigned long freq, unsigned long voltage); unsigned long dyn_power_coeff; }; diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 2b4a8ff72d0d..6cde5b3514c2 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -151,9 +151,9 @@ TRACE_EVENT(thermal_power_cpu_limit, TRACE_EVENT(thermal_power_devfreq_get_power, TP_PROTO(struct thermal_cooling_device *cdev, struct devfreq_dev_status *status, unsigned long freq, - u32 dynamic_power, u32 static_power), + u32 dynamic_power, u32 static_power, u32 power), - TP_ARGS(cdev, status, freq, dynamic_power, static_power), + TP_ARGS(cdev, status, freq, dynamic_power, static_power, power), TP_STRUCT__entry( __string(type, cdev->type ) @@ -161,6 +161,7 @@ TRACE_EVENT(thermal_power_devfreq_get_power, __field(u32, load ) __field(u32, dynamic_power ) __field(u32, static_power ) + __field(u32, power) ), TP_fast_assign( @@ -169,11 +170,13 @@ TRACE_EVENT(thermal_power_devfreq_get_power, __entry->load = (100 * status->busy_time) / status->total_time; __entry->dynamic_power = dynamic_power; __entry->static_power = static_power; + __entry->power = power; ), - TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u", + TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u power=%u", __get_str(type), __entry->freq, - __entry->load, __entry->dynamic_power, __entry->static_power) + __entry->load, __entry->dynamic_power, __entry->static_power, + __entry->power) ); TRACE_EVENT(thermal_power_devfreq_limit, |