diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/inode.c | 7 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 24 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 76 | ||||
-rw-r--r-- | kernel/sched/Makefile | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 29 | ||||
-rw-r--r-- | kernel/sched/cpufreq.c | 48 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 530 | ||||
-rw-r--r-- | kernel/sched/sched.h | 8 | ||||
-rw-r--r-- | kernel/trace/power-traces.c | 1 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 9 |
10 files changed, 671 insertions, 62 deletions
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f2ece3c174a5..8f94ca1860cf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - bpf_map_inc(raw, true); + raw = bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname, goto out; raw = bpf_any_get(inode->i_private, *type); - touch_atime(&path); + if (!IS_ERR(raw)) + touch_atime(&path); path_put(&path); return raw; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adc5e4bd74f8..cf5e9f7ad13a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -218,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -void bpf_map_inc(struct bpf_map *map, bool uref) +/* prog's and map's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) { - atomic_inc(&map->refcnt); + if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&map->refcnt); + return ERR_PTR(-EBUSY); + } if (uref) atomic_inc(&map->usercnt); + return map; } struct bpf_map *bpf_map_get_with_uref(u32 ufd) @@ -234,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - bpf_map_inc(map, true); + map = bpf_map_inc(map, true); fdput(f); return map; @@ -658,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&prog->aux->refcnt); + return ERR_PTR(-EBUSY); + } + return prog; +} + /* called by sockets/tracing/seccomp before attaching program to an event * pairs with bpf_prog_put() */ @@ -670,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) if (IS_ERR(prog)) return prog; - atomic_inc(&prog->aux->refcnt); + prog = bpf_prog_inc(prog); fdput(f); return prog; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db2574e7b8b0..c5c17a62f509 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -239,16 +239,6 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; -static const struct { - int map_type; - int func_id; -} func_limit[] = { - {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, - {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, -}; - static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -921,27 +911,52 @@ static int check_func_arg(struct verifier_env *env, u32 regno, static int check_map_func_compatibility(struct bpf_map *map, int func_id) { - bool bool_map, bool_func; - int i; - if (!map) return 0; - for (i = 0; i < ARRAY_SIZE(func_limit); i++) { - bool_map = (map->map_type == func_limit[i].map_type); - bool_func = (func_id == func_limit[i].func_id); - /* only when map & func pair match it can continue. - * don't allow any other map type to be passed into - * the special func; - */ - if (bool_func && bool_map != bool_func) { - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); - return -EINVAL; - } + /* We need a two way check, first is from map perspective ... */ + switch (map->map_type) { + case BPF_MAP_TYPE_PROG_ARRAY: + if (func_id != BPF_FUNC_tail_call) + goto error; + break; + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + if (func_id != BPF_FUNC_perf_event_read && + func_id != BPF_FUNC_perf_event_output) + goto error; + break; + case BPF_MAP_TYPE_STACK_TRACE: + if (func_id != BPF_FUNC_get_stackid) + goto error; + break; + default: + break; + } + + /* ... and second from the function itself. */ + switch (func_id) { + case BPF_FUNC_tail_call: + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + goto error; + break; + case BPF_FUNC_perf_event_read: + case BPF_FUNC_perf_event_output: + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + goto error; + break; + case BPF_FUNC_get_stackid: + if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) + goto error; + break; + default: + break; } return 0; +error: + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); + return -EINVAL; } static int check_call(struct verifier_env *env, int func_id) @@ -2049,15 +2064,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return -E2BIG; } - /* remember this map */ - env->used_maps[env->used_map_cnt++] = map; - /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - bpf_map_inc(map, false); + map = bpf_map_inc(map, false); + if (IS_ERR(map)) { + fdput(f); + return PTR_ERR(map); + } + env->used_maps[env->used_map_cnt++] = map; + fdput(f); next_insn: insn++; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 414d9c16da42..5e59b832ae2b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b489fcac37b..d1f7149f8704 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -596,17 +596,8 @@ bool sched_can_stop_tick(struct rq *rq) return false; /* - * FIFO realtime policy runs the highest priority task (after DEADLINE). - * Other runnable tasks are of a lower priority. The scheduler tick - * isn't needed. - */ - fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; - if (fifo_nr_running) - return true; - - /* - * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. + * If there are more than one RR tasks, we need the tick to effect the + * actual RR behaviour. */ if (rq->rt.rr_nr_running) { if (rq->rt.rr_nr_running == 1) @@ -615,8 +606,20 @@ bool sched_can_stop_tick(struct rq *rq) return false; } - /* Normal multitasking need periodic preemption checks */ - if (rq->cfs.nr_running > 1) + /* + * If there's no RR tasks, but FIFO tasks, we can skip the tick, no + * forced preemption between FIFO tasks. + */ + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) + return true; + + /* + * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; + * if there's more than one we need the tick for involuntary + * preemption. + */ + if (rq->nr_running > 1) return false; return true; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 928c4ba32f68..1141954e73b4 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -14,24 +14,50 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); /** - * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. * @cpu: The CPU to set the pointer for. * @data: New pointer value. + * @func: Callback function to set for the CPU. * - * Set and publish the update_util_data pointer for the given CPU. That pointer - * points to a struct update_util_data object containing a callback function - * to call from cpufreq_update_util(). That function will be called from an RCU - * read-side critical section, so it must not sleep. + * Set and publish the update_util_data pointer for the given CPU. * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() - * right after this function to avoid use-after-free. + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. @data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. */ -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned long util, unsigned long max)) { - if (WARN_ON(data && !data->func)) + if (WARN_ON(!data || !func)) return; + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); } -EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..154ae3a51e86 --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,530 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct work_struct work; + struct mutex work_lock; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + u64 last_update; +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. + */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + sg_policy->last_freq_update_time = time; + + if (policy->fast_switch_enabled) { + if (sg_policy->next_freq == next_freq) { + trace_cpu_frequency(policy->cur, smp_processor_id()); + return; + } + sg_policy->next_freq = next_freq; + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else if (sg_policy->next_freq != next_freq) { + sg_policy->next_freq = next_freq; + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @policy: cpufreq policy object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + */ +static unsigned int get_next_freq(struct cpufreq_policy *policy, + unsigned long util, unsigned long max) +{ + unsigned int freq = arch_scale_freq_invariant() ? + policy->cpuinfo.max_freq : policy->cur; + + return (freq + (freq >> 2)) * util / max; +} + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned long util, unsigned long max) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int next_f; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : + get_next_freq(policy, util, max); + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) +{ + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int max_f = policy->cpuinfo.max_freq; + u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned int j; + + if (util == ULONG_MAX) + return max_f; + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu; + unsigned long j_util, j_max; + s64 delta_ns; + + if (j == smp_processor_id()) + continue; + + j_sg_cpu = &per_cpu(sugov_cpu, j); + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now. + */ + delta_ns = last_freq_update_time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) + continue; + + j_util = j_sg_cpu->util; + if (j_util == ULONG_MAX) + return max_f; + + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + } + + return get_next_freq(policy, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned long util, unsigned long max) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int next_f; + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_policy, util, max); + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct work_struct *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + schedule_work_on(smp_processor_id(), &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->rate_limit_us); +} + +static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, + size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) + sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + + return count; +} + +static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ + +static struct cpufreq_governor schedutil_gov; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + INIT_WORK(&sg_policy->work, sugov_work); + mutex_init(&sg_policy->work_lock); + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if (!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + unsigned int lat; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) + return -ENOMEM; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto free_sg_policy; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto free_sg_policy; + } + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + schedutil_gov.name); + if (ret) + goto fail; + + out: + mutex_unlock(&global_tunables_lock); + + cpufreq_enable_fast_switch(policy); + return 0; + + fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + + free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret); + return ret; +} + +static int sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + cpufreq_disable_fast_switch(policy); + + mutex_lock(&global_tunables_lock); + + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + return 0; +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + sg_cpu->sg_policy = sg_policy; + if (policy_is_shared(policy)) { + sg_cpu->util = ULONG_MAX; + sg_cpu->max = 0; + sg_cpu->last_update = 0; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_shared); + } else { + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_single); + } + } + return 0; +} + +static int sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + irq_work_sync(&sg_policy->irq_work); + cancel_work_sync(&sg_policy->work); + return 0; +} + +static int sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, + CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, + CPUFREQ_RELATION_L); + + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; + return 0; +} + +int sugov_governor(struct cpufreq_policy *policy, unsigned int event) +{ + if (event == CPUFREQ_GOV_POLICY_INIT) { + return sugov_init(policy); + } else if (policy->governor_data) { + switch (event) { + case CPUFREQ_GOV_POLICY_EXIT: + return sugov_exit(policy); + case CPUFREQ_GOV_START: + return sugov_start(policy); + case CPUFREQ_GOV_STOP: + return sugov_stop(policy); + case CPUFREQ_GOV_LIMITS: + return sugov_limits(policy); + } + } + return -EINVAL; +} + +static struct cpufreq_governor schedutil_gov = { + .name = "schedutil", + .governor = sugov_governor, + .owner = THIS_MODULE, +}; + +static int __init sugov_module_init(void) +{ + return cpufreq_register_governor(&schedutil_gov); +} + +static void __exit sugov_module_exit(void) +{ + cpufreq_unregister_governor(&schedutil_gov); +} + +MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); +MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &schedutil_gov; +} + +fs_initcall(sugov_module_init); +#else +module_init(sugov_module_init); +#endif +module_exit(sugov_module_exit); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec2e8d23527e..921d6e5d33b7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1842,6 +1842,14 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo static inline void cpufreq_trigger_update(u64 time) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif + static inline void account_reset_rq(struct rq *rq) { #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 81b87451c0ea..0c7dee221dca 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05ddc0820771..6f965864cc02 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2095,8 +2095,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + /* + * Only event directories that can be enabled should have + * triggers. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); |