From 203cbf77de59fc8f13502dcfd11350c6d4a5c95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:46 +0000 Subject: hrtimer: Handle remaining time proper for TIME_LOW_RES If CONFIG_TIME_LOW_RES is enabled we add a jiffie to the relative timeout to prevent short sleeps, but we do not account for that in interfaces which retrieve the remaining time. Helge observed that timerfd can return a remaining time larger than the relative timeout. That's not expected and breaks userland test programs. Store the information that the timer was armed relative and provide functions to adjust the remaining time. To avoid bloating the hrtimer struct make state a u8, which as a bonus results in better code on x86 at least. Reported-and-tested-by: Helge Deller Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.273328486@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..2ead22dd74a0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -87,7 +87,8 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @start_pid: timer statistics field to store the pid of the task which + * @is_rel: Set if the timer was armed relative + * @start_pid: timer statistics field to store the pid of the task which * started the timer * @start_site: timer statistics field to store the site where the timer * was started @@ -101,7 +102,8 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; - unsigned long state; + u8 state; + u8 is_rel; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -321,6 +323,27 @@ static inline void clock_was_set_delayed(void) { } #endif +static inline ktime_t +__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) +{ + ktime_t rem = ktime_sub(timer->node.expires, now); + + /* + * Adjust relative timers for the extra we added in + * hrtimer_start_range_ns() to prevent short timeouts. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) + rem.tv64 -= hrtimer_resolution; + return rem; +} + +static inline ktime_t +hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) +{ + return __hrtimer_expires_remaining_adjusted(timer, + timer->base->get_time()); +} + extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); @@ -390,7 +413,12 @@ static inline void hrtimer_restart(struct hrtimer *timer) } /* Query timers: */ -extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); +extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); + +static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + return __hrtimer_get_remaining(timer, false); +} extern u64 hrtimer_get_next_event(void); -- cgit From 386744425e35e04984c6e741c7750fd6eef1a9df Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 1 Jul 2015 14:17:58 +0200 Subject: swiotlb: Make linux/swiotlb.h standalone includible This header file uses the enum dma_data_direction and struct page types without explicitly including the corresponding header files. This makes it rely on the includer to have included the proper headers before. To fix this, include linux/dma-direction.h and forward-declare struct page. The swiotlb_free() function is also annotated __init, therefore requires linux/init.h to be included as well. Signed-off-by: Thierry Reding Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index e7a018eaf3a2..017fced60242 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -1,10 +1,13 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include +#include #include struct device; struct dma_attrs; +struct page; struct scatterlist; extern int swiotlb_force; -- cgit From fae3fde65138b6071b1b0e0b567d4058a8b6a88c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:00:50 +0100 Subject: perf: Collapse and fix event_function_call() users There is one common bug left in all the event_function_call() users, between loading ctx->task and getting to the remote_function(), ctx->task can already have been changed. Therefore we need to double check and retry if ctx->task != current. Insert another trampoline specific to event_function_call() that checks for this and further validates state. This also allows getting rid of the active/inactive functions. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 365 +++++++++++++++++++----------------------- kernel/events/hw_breakpoint.c | 2 +- 3 files changed, 168 insertions(+), 201 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..6612732d8fd0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); -extern int __perf_event_disable(void *info); +extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index 66c9ad4f8707..6620432491f6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -126,6 +126,28 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + /* * On task ctx scheduling... * @@ -158,21 +180,96 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ -static void event_function_call(struct perf_event *event, - int (*active)(void *), - void (*inactive)(void *), - void *data) +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + * + * If ctx->task == current, we know things must remain valid because + * we have IRQs disabled so we cannot schedule. + */ + if (ctx->task) { + if (ctx->task != current) + return -EAGAIN; + + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Now that we hold locks, double check state. Paranoia pays. + */ + if (task_ctx) { + WARN_ON_ONCE(task_ctx->task != current); + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(cpuctx->task_ctx != task_ctx); + } + efs->func(event, cpuctx, ctx, efs->data); + perf_ctx_unlock(cpuctx, task_ctx); + + return 0; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; if (!task) { - cpu_function_call(event->cpu, active, data); + cpu_function_call(event->cpu, event_function, &efs); return; } again: - if (!task_function_call(task, active, data)) + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); @@ -185,7 +282,7 @@ again: raw_spin_unlock_irq(&ctx->lock); goto again; } - inactive(data); + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -400,28 +497,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -1684,38 +1759,22 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; - -static void ___perf_remove_from_context(void *info) -{ - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - - if (re->detach_group) - perf_group_detach(event); - list_del_event(event, ctx); -} - /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + bool detach_group = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (detach_group) perf_group_detach(event); list_del_event(event, ctx); @@ -1726,17 +1785,11 @@ static int __perf_remove_from_context(void *info) cpuctx->task_ctx = NULL; } } - raw_spin_unlock(&ctx->lock); - - return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1746,71 +1799,31 @@ static int __perf_remove_from_context(void *info) */ static void perf_remove_from_context(struct perf_event *event, bool detach_group) { - struct perf_event_context *ctx = event->ctx; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; - - lockdep_assert_held(&ctx->mutex); + lockdep_assert_held(&event->ctx->mutex); event_function_call(event, __perf_remove_from_context, - ___perf_remove_from_context, &re); + (void *)(unsigned long)detach_group); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -void ___perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1837,8 +1850,12 @@ static void _perf_event_disable(struct perf_event *event) } raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_disable, - ___perf_event_disable, event); + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -2202,44 +2219,29 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - struct perf_event_context *task_ctx = cpuctx->task_ctx; - - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - perf_ctx_lock(cpuctx, task_ctx); - WARN_ON_ONCE(&cpuctx->ctx != ctx && task_ctx != ctx); - update_context_time(ctx); + struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + return; + update_context_time(ctx); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { + perf_cgroup_set_timestamp(current, ctx); // XXX ? perf_cgroup_defer_enabled(event); - goto unlock; + } + return; } /* @@ -2247,19 +2249,13 @@ static int __perf_event_enable(void *info) * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; + return; - ctx_resched(cpuctx, task_ctx); + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); -unlock: - perf_ctx_unlock(cpuctx, task_ctx); - - return 0; -} - -void ___perf_event_enable(void *info) -{ - __perf_event_mark_enabled((struct perf_event *)info); + ctx_resched(cpuctx, task_ctx); } /* @@ -2292,8 +2288,7 @@ static void _perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_enable, - ___perf_event_enable, event); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -4095,36 +4090,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static void ___perf_event_period(void *info) -{ - struct period_event *pe = info; - struct perf_event *event = pe->event; - u64 value = pe->value; - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); -} - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4144,14 +4117,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; u64 value; if (!is_sampling_event(event)) @@ -4166,10 +4135,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - pe.value = value; - - event_function_call(event, __perf_event_period, - ___perf_event_period, &pe); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4941,7 +4907,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -9239,13 +9205,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); -- cgit From 6a84f57241e1fb9fb6772256f538d1073359a32d Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Thu, 21 Jan 2016 14:02:39 -0800 Subject: raid6/algos.c : bug fix : Add the missing definitions to the pq.h file Adding these pr_info and pr_err definitions so as to allow code to be compiled successfully for testing in userspace, since the printk has been replaced by pr_info and pr_err in algos.c Absence of these definitions result in the compilation errors such as ' undefined reference to `pr_info' ' ' undefined reference to `pr_err' ' Cc: NeilBrown Cc: Anton Blanchard Cc: Fenghua Yu Signed-off-by: Gayatri Kammela Signed-off-by: Shaohua Li --- include/linux/raid/pq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index a7a06d1dcf9c..a0118d5929a9 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -152,6 +152,8 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, # define jiffies raid6_jiffies() # define printk printf +# define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__) +# define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__) # define GFP_KERNEL 0 # define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ PROT_READ|PROT_WRITE, \ -- cgit From facc432faa59414bd7c60c307ff1645154a66c98 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Jan 2016 11:43:44 +0100 Subject: net: simplify napi_synchronize() to avoid warnings The napi_synchronize() function is defined twice: The definition for SMP builds waits for other CPUs to be done, while the uniprocessor variant just contains a barrier and ignores its argument. In the mvneta driver, this leads to a warning about an unused variable when we lookup the NAPI struct of another CPU and then don't use it: ethernet/marvell/mvneta.c: In function 'mvneta_percpu_notifier': ethernet/marvell/mvneta.c:2910:30: error: unused variable 'other_port' [-Werror=unused-variable] There are no other CPUs on a UP build, so that code never runs, but gcc does not know this. The nicest solution seems to be to turn the napi_synchronize() helper into an inline function for the UP case as well, as that leads gcc to not complain about the argument being unused. Once we do that, we can also combine the two cases into a single function definition and use if(IS_ENABLED()) rather than #ifdef to make it look a bit nicer. The warning first came up in linux-4.4, but I failed to catch it earlier. Signed-off-by: Arnd Bergmann Fixes: f86428854480 ("net: mvneta: Statically assign queues to CPUs") Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ac140dcb789..289c2314d766 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -512,7 +512,6 @@ static inline void napi_enable(struct napi_struct *n) clear_bit(NAPI_STATE_NPSVC, &n->state); } -#ifdef CONFIG_SMP /** * napi_synchronize - wait until NAPI is not running * @n: napi context @@ -523,12 +522,12 @@ static inline void napi_enable(struct napi_struct *n) */ static inline void napi_synchronize(const struct napi_struct *n) { - while (test_bit(NAPI_STATE_SCHED, &n->state)) - msleep(1); + if (IS_ENABLED(CONFIG_SMP)) + while (test_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + else + barrier(); } -#else -# define napi_synchronize(n) barrier() -#endif enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, -- cgit From 71f50c6d9a2276f3ec85384bffe2aee1962f4669 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Jan 2016 01:33:51 +0900 Subject: of: drop symbols declared by _OF_DECLARE() from modules The users of this macro (OF_EARLYCON_DECLARE, CLK_OF_DECLARE, IRQCHIP_DECLARE, etc.) are only parsed in the early boot stage. Such symbols contained in modules are never used. This commit fixes the link error introduced by commit b8d20e06eaad ("serial: 8250_uniphier: add earlycon support"); the combination of CONFIG_SERIAL_8250_UNIPHIER=m and CONFIG_SERIAL_8250_CONSOLE=y fails to link: ERROR: "early_serial8250_setup" [drivers/tty/serial/8250/8250_uniphier.ko] undefined! Fixes: b8d20e06eaad ("serial: 8250_uniphier: add earlycon support") Signed-off-by: Masahiro Yamada Acked-by: Arnd Bergmann Signed-off-by: Rob Herring --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index dd10626a615f..dc6e39696b64 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -929,7 +929,7 @@ static inline int of_get_available_child_count(const struct device_node *np) return num; } -#ifdef CONFIG_OF +#if defined(CONFIG_OF) && !defined(MODULE) #define _OF_DECLARE(table, name, compat, fn, fn_type) \ static const struct of_device_id __of_table_##name \ __used __section(__##table##_of_table) \ -- cgit From 530cbe100ef7587aa5b5ac3a4b670cda4d50e598 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Jan 2016 13:52:25 +0000 Subject: irqdomain: Allow domain lookup with DOMAIN_BUS_WIRED token Let's take the (outlandish) example of an interrupt controller capable of handling both wired interrupts and PCI MSIs. With the current code, the PCI MSI domain is going to be tagged with DOMAIN_BUS_PCI_MSI, and the wired domain with DOMAIN_BUS_ANY. Things get hairy when we start looking up the domain for a wired interrupt (typically when creating it based on some firmware information - DT or ACPI). In irq_create_fwspec_mapping(), we perform the lookup using DOMAIN_BUS_ANY, which is actually used as a wildcard. This gives us one chance out of two to end up with the wrong domain, and we try to configure a wired interrupt with the MSI domain. Everything grinds to a halt pretty quickly. What we really need to do is to start looking for a domain that would uniquely identify a wired interrupt domain, and only use DOMAIN_BUS_ANY as a fallback. In order to solve this, let's introduce a new DOMAIN_BUS_WIRED token, which is going to be used exactly as described above. Of course, this depends on the irqchip to setup the domain bus_token, and nobody had to implement this so far. Only so far. Signed-off-by: Marc Zyngier Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Cc: Grant Likely Cc: Thomas Petazzoni Cc: Jiang Liu Link: http://lkml.kernel.org/r/1453816347-32720-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + kernel/irq/irqdomain.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f64622ad02c1..04579d9fbce4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -70,6 +70,7 @@ struct irq_fwspec { */ enum irq_domain_bus_token { DOMAIN_BUS_ANY = 0, + DOMAIN_BUS_WIRED, DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8cf95de1ab3f..d75179735a28 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -575,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", -- cgit From 0bfd464d3fdd5bb322f9cace4cc47f1796545cf7 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 21:13:44 -0800 Subject: tty: Wait interruptibly for tty lock on reopen Allow a signal to interrupt the wait for a tty reopen; eg., if the tty has starting final close and is waiting for the device to drain. Signed-off-by: Peter Hurley Cc: stable # 4.4 Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 8 +++++++- drivers/tty/tty_mutex.c | 8 ++++++++ include/linux/tty.h | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 892c92354745..765935b144d6 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2065,7 +2065,12 @@ retry_open: if (tty) { mutex_unlock(&tty_mutex); - tty_lock(tty); + retval = tty_lock_interruptible(tty); + if (retval) { + if (retval == -EINTR) + retval = -ERESTARTSYS; + goto err_unref; + } /* safe to drop the kref from tty_driver_lookup_tty() */ tty_kref_put(tty); retval = tty_reopen(tty); @@ -2152,6 +2157,7 @@ retry_open: return 0; err_unlock: mutex_unlock(&tty_mutex); +err_unref: /* after locks to avoid deadlock */ if (!IS_ERR_OR_NULL(driver)) tty_driver_kref_put(driver); diff --git a/drivers/tty/tty_mutex.c b/drivers/tty/tty_mutex.c index 77703a391207..d2f3c4cd697f 100644 --- a/drivers/tty/tty_mutex.c +++ b/drivers/tty/tty_mutex.c @@ -19,6 +19,14 @@ void __lockfunc tty_lock(struct tty_struct *tty) } EXPORT_SYMBOL(tty_lock); +int tty_lock_interruptible(struct tty_struct *tty) +{ + if (WARN(tty->magic != TTY_MAGIC, "L Bad %p\n", tty)) + return -EIO; + tty_kref_get(tty); + return mutex_lock_interruptible(&tty->legacy_mutex); +} + void __lockfunc tty_unlock(struct tty_struct *tty) { if (WARN(tty->magic != TTY_MAGIC, "U Bad %p\n", tty)) diff --git a/include/linux/tty.h b/include/linux/tty.h index 2fd8708ea888..d9fb4b043f56 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -649,6 +649,7 @@ extern long vt_compat_ioctl(struct tty_struct *tty, /* tty_mutex.c */ /* functions for preparation of BKL removal */ extern void __lockfunc tty_lock(struct tty_struct *tty); +extern int tty_lock_interruptible(struct tty_struct *tty); extern void __lockfunc tty_unlock(struct tty_struct *tty); extern void __lockfunc tty_lock_slave(struct tty_struct *tty); extern void __lockfunc tty_unlock_slave(struct tty_struct *tty); -- cgit From b3c6de492b9ea30a8dcc535d4dc2eaaf0bb3f116 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 21 Jan 2016 16:47:29 +0100 Subject: cleancache: constify cleancache_ops structure The cleancache_ops structure is never modified, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/tmem.c | 2 +- include/linux/cleancache.h | 2 +- mm/cleancache.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 945fc4327201..4ac2ca8a7656 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -242,7 +242,7 @@ static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); } -static struct cleancache_ops tmem_cleancache_ops = { +static const struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, .invalidate_page = tmem_cleancache_flush_page, diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index bda5ec0b4b4d..cb3e14234502 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -37,7 +37,7 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern int cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); diff --git a/mm/cleancache.c b/mm/cleancache.c index 8fc50811119b..ba5d8f3e6d68 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -22,7 +22,7 @@ * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops *cleancache_ops __read_mostly; +static const struct cleancache_ops *cleancache_ops __read_mostly; /* * Counters available via /sys/kernel/debug/cleancache (if debugfs is @@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused) /* * Register operations for cleancache. Returns 0 on success. */ -int cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(const struct cleancache_ops *ops) { if (cmpxchg(&cleancache_ops, NULL, ops)) return -EBUSY; -- cgit From a39bb9a0557a3fc860c3c9d67a7326b0d2f1bd66 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 7 Jan 2016 05:06:13 +0800 Subject: include/linux/cleancache.h: Clean up code Let cleancache_fs_enabled() call cleancache_fs_enabled_mapping() directly. Remove redundant variable ret in cleancache_get_page(). Signed-off-by: Chen Gang Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/cleancache.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index cb3e14234502..fccf7f44139d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -48,14 +48,14 @@ extern void __cleancache_invalidate_fs(struct super_block *); #ifdef CONFIG_CLEANCACHE #define cleancache_enabled (1) -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) { return mapping->host->i_sb->cleancache_poolid >= 0; } +static inline bool cleancache_fs_enabled(struct page *page) +{ + return cleancache_fs_enabled_mapping(page->mapping); +} #else #define cleancache_enabled (0) #define cleancache_fs_enabled(_page) (0) @@ -89,11 +89,9 @@ static inline void cleancache_init_shared_fs(struct super_block *sb) static inline int cleancache_get_page(struct page *page) { - int ret = -1; - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; + return __cleancache_get_page(page); + return -1; } static inline void cleancache_put_page(struct page *page) -- cgit From e03e7ee34fdd1c3ef494949a75cb8c61c7265fa9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Jan 2016 20:59:49 -0800 Subject: perf/bpf: Convert perf_event_array to use struct file Robustify refcounting. Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160126045947.GA40151@ast-mbp.thefacebook.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++-- kernel/bpf/arraymap.c | 21 +++++++++++---------- kernel/events/core.c | 21 ++++++++++----------- kernel/trace/bpf_trace.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6612732d8fd0..4f90434b8d64 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -729,7 +729,7 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); -extern struct perf_event *perf_event_get(unsigned int fd); +extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1070,7 +1070,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } -static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced518..89ebbc4d1164 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/events/core.c b/kernel/events/core.c index fe97f95f204e..eb44730afea5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8916,21 +8916,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45dd798bcd37..326a75e884db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit From c6e5b73242d2d9172ea880483bc4ba7ffca0cfb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2016 16:07:41 +0200 Subject: perf: Synchronously clean up child events The orphan cleanup workqueue doesn't always catch orphans, for example, if they never schedule after they are orphaned. IOW, the event leak is still very real. It also wouldn't work for kernel counters. Doing it synchonously is a little hairy due to lock inversion issues, but is made to work. Patch based on work by Alexander Shishkin. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 - kernel/events/core.c | 174 ++++++++++++++++++++++----------------------- 2 files changed, 84 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f90434b8d64..b35a61a481fa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -634,9 +634,6 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; - - struct delayed_work orphans_remove; - bool orphans_remove_sched; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index e549cf2accdd..98c862aff8fa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -1645,45 +1643,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && event->state == PERF_EVENT_STATE_EXIT; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 0 : -1; + return event->state == PERF_EVENT_STATE_EXIT; } -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1744,9 +1708,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1984,9 +1945,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -3369,7 +3327,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3782,11 +3739,22 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx; + struct perf_event *child, *tmp; + if (!is_kernel_event(event)) perf_remove_from_owner(event); @@ -3811,14 +3779,70 @@ static void put_event(struct perf_event *event) * At this point we must have event->state == PERF_EVENT_STATE_EXIT, * either from the above perf_remove_from_context() or through * perf_event_exit_event(). + * + * Therefore, anybody acquiring event->child_mutex after the below + * loop _must_ also see this, most importantly inherit_event() which + * will avoid placing more children on the list. + * + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + + /* Must be the last reference */ put_event(event); return 0; } @@ -3829,46 +3853,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. - */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, DETACH_GROUP); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -8719,7 +8707,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; + child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -8977,8 +8965,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9026,8 +9022,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); -- cgit From 0d9bacb6c8265e7aa75ac28ae9b5d7748065942f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Jan 2016 14:28:48 +0900 Subject: iommu: Update struct iommu_ops comments Update the comments around struct iommu_ops to match current state and fix a few typos while at it. Signed-off-by: Magnus Damm Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f28dff313b07..a5c539fa5d2b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -133,8 +133,9 @@ struct iommu_dm_region { /** * struct iommu_ops - iommu ops and capabilities - * @domain_init: init iommu domain - * @domain_destroy: destroy iommu domain + * @capable: check capability + * @domain_alloc: allocate iommu domain + * @domain_free: free iommu domain * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain @@ -144,8 +145,15 @@ struct iommu_dm_region { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes + * @get_dm_regions: Request list of direct mapping requirements for a device + * @put_dm_regions: Free list of direct mapping requirements for a device + * @domain_window_enable: Configure and enable a particular window for a domain + * @domain_window_disable: Disable a particular window for a domain + * @domain_set_windows: Set the number of windows for a domain + * @domain_get_windows: Return the number of windows for a domain * @of_xlate: add OF master IDs to iommu grouping * @pgsize_bitmap: bitmap of supported page sizes * @priv: per-instance data private to the iommu driver @@ -182,9 +190,9 @@ struct iommu_ops { int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); - /* Set the numer of window per domain */ + /* Set the number of windows per domain */ int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count); - /* Get the numer of window per domain */ + /* Get the number of windows per domain */ u32 (*domain_get_windows)(struct iommu_domain *domain); #ifdef CONFIG_OF_IOMMU -- cgit From 65f87ee71852a754f7981d0653e7136039b8798a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 25 Jan 2016 17:23:18 -0800 Subject: fs, block: force direct-I/O for dax-enabled block devices Similar to the file I/O path, re-direct all I/O to the DAX path for I/O to a block-device special file. Both regular files and device special files can use the common filp->f_mapping->host lookup to determing is DAX is enabled. Otherwise, we confuse the DAX code that does not expect to find live data in the page cache: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 7676 at mm/filemap.c:217 __delete_from_page_cache+0x9f6/0xb60() Modules linked in: CPU: 0 PID: 7676 Comm: a.out Not tainted 4.4.0+ #276 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 00000000ffffffff ffff88006d3f7738 ffffffff82999e2d 0000000000000000 ffff8800620a0000 ffffffff86473d20 ffff88006d3f7778 ffffffff81352089 ffffffff81658d36 ffffffff86473d20 00000000000000d9 ffffea0000009d60 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [] warn_slowpath_common+0xd9/0x140 kernel/panic.c:482 [] warn_slowpath_null+0x29/0x30 kernel/panic.c:515 [] __delete_from_page_cache+0x9f6/0xb60 mm/filemap.c:217 [] delete_from_page_cache+0x112/0x200 mm/filemap.c:244 [] __dax_fault+0x859/0x1800 fs/dax.c:487 [] blkdev_dax_fault+0x26/0x30 fs/block_dev.c:1730 [< inline >] wp_pfn_shared mm/memory.c:2208 [] do_wp_page+0xc85/0x14f0 mm/memory.c:2307 [< inline >] handle_pte_fault mm/memory.c:3323 [< inline >] __handle_mm_fault mm/memory.c:3417 [] handle_mm_fault+0x2483/0x4640 mm/memory.c:3446 [] __do_page_fault+0x376/0x960 arch/x86/mm/fault.c:1238 [] trace_do_page_fault+0xe8/0x420 arch/x86/mm/fault.c:1331 [] do_async_page_fault+0x14/0xd0 arch/x86/kernel/kvm.c:264 [] async_page_fault+0x28/0x30 arch/x86/entry/entry_64.S:986 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 ---[ end trace dae21e0f85f1f98c ]--- Fixes: 5a023cdba50c ("block: enable dax for raw block devices") Reported-by: Dmitry Vyukov Reported-by: Kirill A. Shutemov Suggested-by: Jan Kara Reviewed-by: Jan Kara Suggested-by: Matthew Wilcox Tested-by: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a2046275cdf..b10002d4a5f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2907,7 +2907,7 @@ extern void replace_mount_options(struct super_block *sb, char *options); static inline bool io_is_direct(struct file *filp) { - return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); + return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); } static inline int iocb_flags(struct file *file) -- cgit From 9f4736fe7ca804aa79b5916221bb13dfc6221a0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:13:39 -0800 Subject: block: revert runtime dax control of the raw block device Dynamically enabling DAX requires that the page cache first be flushed and invalidated. This must occur atomically with the change of DAX mode otherwise we confuse the fsync/msync tracking and violate data durability guarantees. Eliminate the possibilty of DAX-disabled to DAX-enabled transitions for now and revisit this for the next cycle. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox Cc: Andrew Morton Cc: Ross Zwisler Signed-off-by: Dan Williams --- block/ioctl.c | 38 -------------------------------------- fs/block_dev.c | 28 ---------------------------- include/linux/fs.h | 3 --- include/uapi/linux/fs.h | 1 - 4 files changed, 70 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index 77f5d17779d6..d8996bbd7f12 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -434,42 +434,6 @@ bool blkdev_dax_capable(struct block_device *bdev) return true; } - -static int blkdev_daxset(struct block_device *bdev, unsigned long argp) -{ - unsigned long arg; - int rc = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (get_user(arg, (int __user *)(argp))) - return -EFAULT; - arg = !!arg; - if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) - return 0; - - if (arg) - arg = S_DAX; - - if (arg && !blkdev_dax_capable(bdev)) - return -ENOTTY; - - inode_lock(bdev->bd_inode); - if (bdev->bd_map_count == 0) - inode_set_flags(bdev->bd_inode, arg, S_DAX); - else - rc = -EBUSY; - inode_unlock(bdev->bd_inode); - return rc; -} -#else -static int blkdev_daxset(struct block_device *bdev, int arg) -{ - if (arg) - return -ENOTTY; - return 0; -} #endif static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, @@ -634,8 +598,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); - case BLKDAXSET: - return blkdev_daxset(bdev, arg); case BLKDAXGET: return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); break; diff --git a/fs/block_dev.c b/fs/block_dev.c index 7b9cd49622b1..afb437484362 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1736,37 +1736,13 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); } -static void blkdev_vm_open(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - inode_lock(bd_inode); - bdev->bd_map_count++; - inode_unlock(bd_inode); -} - -static void blkdev_vm_close(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - inode_lock(bd_inode); - bdev->bd_map_count--; - inode_unlock(bd_inode); -} - static const struct vm_operations_struct blkdev_dax_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = blkdev_dax_fault, .pmd_fault = blkdev_dax_pmd_fault, .pfn_mkwrite = blkdev_dax_fault, }; static const struct vm_operations_struct blkdev_default_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, }; @@ -1774,18 +1750,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = { static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - inode_lock(bd_inode); - bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &blkdev_default_vm_ops; } - inode_unlock(bd_inode); return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index b10002d4a5f5..ae681002100a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -484,9 +484,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -#ifdef CONFIG_FS_DAX - int bd_map_count; -#endif }; /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 41e0433b4a83..149bec83a907 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -222,7 +222,6 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) -#define BLKDAXSET _IO(0x12,128) #define BLKDAXGET _IO(0x12,129) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ -- cgit From d1a5f2b4d8a125943dcb6b032fc7eaefc2c78296 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:25:31 -0800 Subject: block: use DAX for partition table reads Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Reported-by: Ross Zwisler Tested-by: Ross Zwisler Reviewed-by: Matthew Wilcox Signed-off-by: Dan Williams --- block/partition-generic.c | 18 +++++++++++++++--- fs/dax.c | 20 ++++++++++++++++++++ include/linux/dax.h | 11 +++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index 746935a5973c..fefd01b496a0 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "partitions/check.h" @@ -550,13 +551,24 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; + + return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - NULL); + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; diff --git a/fs/dax.c b/fs/dax.c index 4fd6b0c5c6b5..e0e9358baf35 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -58,6 +58,26 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } +struct page *read_dax_sector(struct block_device *bdev, sector_t n) +{ + struct page *page = alloc_pages(GFP_KERNEL, 0); + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), + }; + long rc; + + if (!page) + return ERR_PTR(-ENOMEM); + + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); + dax_unmap_atomic(bdev, &dax); + return page; +} + /* * dax_clear_blocks() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS diff --git a/include/linux/dax.h b/include/linux/dax.h index 8204c3dc3800..818e45078929 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); + +#ifdef CONFIG_FS_DAX +struct page *read_dax_sector(struct block_device *bdev, sector_t n); +#else +static inline struct page *read_dax_sector(struct block_device *bdev, + sector_t n) +{ + return ERR_PTR(-ENXIO); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t, dax_iodone_t); -- cgit From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- include/linux/pfn_t.h | 4 ++-- kernel/memremap.c | 2 +- tools/testing/nvdimm/test/iomap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 0703b5360d31..37448ab5fb5c 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn) return NULL; } -static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) { return PFN_PHYS(pfn_t_to_pfn(pfn)); } diff --git a/kernel/memremap.c b/kernel/memremap.c index cbc3e97e2bb4..70ee3775de24 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -150,7 +150,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 7ec7df9e7fc7..0c1a7e65bb81 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, } EXPORT_SYMBOL(__wrap_devm_memremap_pages); -pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { struct nfit_test_resource *nfit_res = get_nfit_res(addr); -- cgit From a3d0a918502cc73af4f60da2cc4c5cac5573f183 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 2 Feb 2016 16:57:08 -0800 Subject: thp: make split_queue per-node Andrea Arcangeli suggested to make split queue per-node to improve scalability. Let's do it. Signed-off-by: Kirill A. Shutemov Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ mm/huge_memory.c | 49 ++++++++++++++++++++++++++----------------------- mm/page_alloc.c | 5 +++++ 3 files changed, 37 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 33bb1b19273e..7b6c2cfee390 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -682,6 +682,12 @@ typedef struct pglist_data { */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spinlock_t split_queue_lock; + struct list_head split_queue; + unsigned long split_queue_len; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fd3a07b3e6f4..253a25e007d7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; -static DEFINE_SPINLOCK(split_queue_lock); -static LIST_HEAD(split_queue); -static unsigned long split_queue_len; static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) @@ -3358,6 +3355,7 @@ int total_mapcount(struct page *page) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct anon_vma *anon_vma; int count, mapcount, ret; bool mlocked; @@ -3401,19 +3399,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* Prevent deferred_split_scan() touching ->_count */ - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && count == 1) { if (!list_empty(page_deferred_list(head))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); __split_huge_page(page, list); ret = 0; } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); pr_alert("total_mapcount: %u, page_count(): %u\n", mapcount, count); if (PageTail(page)) @@ -3421,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } else { - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); unfreeze_page(anon_vma, head); ret = -EBUSY; } @@ -3436,52 +3434,56 @@ out: void free_transhuge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - split_queue_len--; + pgdata->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { - list_add_tail(page_deferred_list(page), &split_queue); - split_queue_len++; + list_add_tail(page_deferred_list(page), &pgdata->split_queue); + pgdata->split_queue_len++; } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { + struct pglist_data *pgdata = NODE_DATA(sc->nid); /* * Split a page from split_queue will free up at least one page, * at most HPAGE_PMD_NR - 1. We don't track exact number. * Let's use HPAGE_PMD_NR / 2 as ballpark. */ - return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; + return ACCESS_ONCE(pgdata->split_queue_len) * HPAGE_PMD_NR / 2; } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct pglist_data *pgdata = NODE_DATA(sc->nid); unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_init(&split_queue, &list); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_init(&pgdata->split_queue, &list); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &list) { @@ -3490,10 +3492,10 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, /* race with put_compound_page() */ if (!get_page_unless_zero(page)) { list_del_init(page_deferred_list(page)); - split_queue_len--; + pgdata->split_queue_len--; } } - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -3505,9 +3507,9 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, put_page(page); } - spin_lock_irqsave(&split_queue_lock, flags); - list_splice_tail(&list, &split_queue); - spin_unlock_irqrestore(&split_queue_lock, flags); + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_tail(&list, &pgdata->split_queue); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); return split * HPAGE_PMD_NR / 2; } @@ -3516,6 +3518,7 @@ static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, }; #ifdef CONFIG_DEBUG_FS diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63358d9f9aa9..ea2c4d3e0c03 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5209,6 +5209,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&pgdat->split_queue_lock); + INIT_LIST_HEAD(&pgdat->split_queue); + pgdat->split_queue_len = 0; #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); -- cgit From 65376df582174ffcec9e6471bf5b0dd79ba05e4a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:29 -0800 Subject: proc: revert /proc//maps [stack:TID] annotation Commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") added [stack:TID] annotation to /proc//maps. Finding the task of a stack VMA requires walking the entire thread list, turning this into quadratic behavior: a thousand threads means a thousand stacks, so the rendering of /proc//maps needs to look at a million combinations. The cost is not in proportion to the usefulness as described in the patch. Drop the [stack:TID] annotation to make /proc//maps (and /proc//numa_maps) usable again for higher thread counts. The [stack] annotation inside /proc//task//maps is retained, as identifying the stack VMA there is an O(1) operation. Siddesh said: "The end users needed a way to identify thread stacks programmatically and there wasn't a way to do that. I'm afraid I no longer remember (or have access to the resources that would aid my memory since I changed employers) the details of their requirement. However, I did do this on my own time because I thought it was an interesting project for me and nobody really gave any feedback then as to its utility, so as far as I am concerned you could roll back the main thread maps information since the information is available in the thread-specific files" Signed-off-by: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Siddhesh Poyarekar Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 9 ++---- fs/proc/task_mmu.c | 66 +++++++++++++------------------------- fs/proc/task_nommu.c | 49 ++++++++++++---------------- include/linux/mm.h | 3 +- mm/util.c | 27 +--------------- 5 files changed, 48 insertions(+), 106 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fde9fd06fa98..eaebf27539f5 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -356,7 +356,7 @@ address perms offset dev inode pathname a7cb1000-a7cb2000 ---p 00000000 00:00 0 a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 -a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001] +a7eb3000-a7ed5000 rw-p 00000000 00:00 0 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6 a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6 @@ -388,7 +388,6 @@ is not associated with a file: [heap] = the heap of the program [stack] = the stack of the main process - [stack:1001] = the stack of the thread with tid 1001 [vdso] = the "virtual dynamic shared object", the kernel system call handler @@ -396,10 +395,8 @@ is not associated with a file: The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint of the individual tasks of a process. In this file you will see a mapping marked -as [stack] if that task sees it as a stack. This is a key difference from the -content of /proc/PID/maps, where you will see all mappings that are being used -as stack by all of those tasks. Hence, for the example above, the task-level -map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: +as [stack] if that task sees it as a stack. Hence, for the example above, the +task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this: 08048000-08049000 r-xp 00000000 03:00 8312 /opt/test 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4a0c31f904a6..fa95ab2d3674 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + int stack = 0; + + if (is_pid) { + stack = vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } static void @@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = arch_vma_name(vma); if (!name) { - pid_t tid; - if (!mm) { name = "[vdso]"; goto done; @@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = pid_of_stack(priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } else { - /* Thread stack in /proc/PID/maps */ - seq_pad(m, ' '); - seq_printf(m, "[stack:%d]", tid); - } - } + if (is_stack(priv, vma, is_pid)) + name = "[stack]"; } done: @@ -1618,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else { - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_puts(m, " stack"); - else - seq_printf(m, " stack:%d", tid); - } + } else if (is_stack(proc_priv, vma, is_pid)) { + seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index e0d64c92e4f6..faacb0c0d857 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + struct mm_struct *mm = vma->vm_mm; + int stack = 0; + + if (is_pid) { + stack = vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } /* @@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm) { - pid_t tid = pid_of_stack(priv, vma, is_pid); - - if (tid != 0) { - seq_pad(m, ' '); - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_printf(m, "[stack]"); - else - seq_printf(m, "[stack:%d]", tid); - } + } else if (mm && is_stack(priv, vma, is_pid)) { + seq_pad(m, ' '); + seq_printf(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/include/linux/mm.h b/include/linux/mm.h index f1cd22f2df1a..0b50d7848e3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1341,8 +1341,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } -extern struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group); +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, diff --git a/mm/util.c b/mm/util.c index c108a6542d05..4fb14ca5a419 100644 --- a/mm/util.c +++ b/mm/util.c @@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). - */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { -- cgit From c792e8240338e41eda4d06a3a71a7bb7af4e6156 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:38 -0800 Subject: mm: memcontrol: drop superfluous entry in the per-memcg stats array MEM_CGROUP_STAT_NSTATS is just a delimiter for cgroup1 statistics, not an actual array entry. Reuse it for the first cgroup2 stat entry, like in the event array. Fixes: b2807f07f4f8 ("mm: memcontrol: add "sock" to cgroup2 memory.stat") Signed-off-by: Johannes Weiner Cc: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ae48d4aeb5e..792c8981e633 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -51,7 +51,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK, + MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, MEMCG_NR_STAT, }; -- cgit From 30bdbb78009e67767983085e302bec6d97afc679 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 2 Feb 2016 16:57:46 -0800 Subject: mm: polish virtual memory accounting * add VM_STACK as alias for VM_GROWSUP/DOWN depending on architecture * always account VMAs with flag VM_STACK as stack (as it was before) * cleanup classifying helpers * update comments and documentation Signed-off-by: Konstantin Khlebnikov Tested-by: Sudip Mukherjee Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++-- include/linux/mm.h | 6 ++++-- include/linux/mm_types.h | 6 +++--- mm/internal.h | 23 +++++++++++++++++++---- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index eaebf27539f5..843b045b4069 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -240,8 +240,8 @@ Table 1-2: Contents of the status files (as of 4.1) RssFile size of resident file mappings RssShmem size of resident shmem memory (includes SysV shm, mapping of tmpfs and shared anonymous mappings) - VmData size of data, stack, and text segments - VmStk size of data, stack, and text segments + VmData size of private data segments + VmStk size of stack segments VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b50d7848e3a..516e14944339 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,11 +201,13 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_STACK_GROWSUP -#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STACK VM_GROWSUP #else -#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STACK VM_GROWSDOWN #endif +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + /* * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d3ebb9d21a53..624b78b848b8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -424,9 +424,9 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ unsigned long pinned_vm; /* Refcount permanently increased */ - unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */ - unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ - unsigned long stack_vm; /* VM_GROWSUP/DOWN */ + unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ + unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ + unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; diff --git a/mm/internal.h b/mm/internal.h index 6e976302ddd8..a38a21ebddb4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -216,20 +216,35 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ static inline bool is_exec_mapping(vm_flags_t flags) { - return (flags & (VM_EXEC | VM_WRITE)) == VM_EXEC; + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } +/* + * Stack area - atomatically grows in one direction + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ static inline bool is_stack_mapping(vm_flags_t flags) { - return (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) != 0; + return (flags & VM_STACK) == VM_STACK; } +/* + * Data area - private, writable, not stack + */ static inline bool is_data_mapping(vm_flags_t flags) { - return (flags & ((VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)) | - VM_WRITE | VM_SHARED)) == VM_WRITE; + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } /* mm/util.c */ -- cgit From 46437f9a554fbe3e110580ca08ab703b59f2f95a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 2 Feb 2016 16:57:52 -0800 Subject: radix-tree: fix race in gang lookup If the indirect_ptr bit is set on a slot, that indicates we need to redo the lookup. Introduce a new function radix_tree_iter_retry() which forces the loop to retry the lookup by setting 'slot' to NULL and turning the iterator back to point at the problematic entry. This is a pretty rare problem to hit at the moment; the lookup has to race with a grow of the radix tree from a height of 0. The consequences of hitting this race are that gang lookup could return a pointer to a radix_tree_node instead of a pointer to whatever the user had inserted in the tree. Fixes: cebbd29e1c2f ("radix-tree: rewrite gang lookup using iterator") Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Ohad Ben-Cohen Cc: Konstantin Khlebnikov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ lib/radix-tree.c | 12 ++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7c88ad156a29..00b17c526c1f 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -378,6 +378,22 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) void **radix_tree_next_chunk(struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags); +/** + * radix_tree_iter_retry - retry this chunk of the iteration + * @iter: iterator state + * + * If we iterate over a tree protected only by the RCU lock, a race + * against deletion or creation may result in seeing a slot for which + * radix_tree_deref_retry() returns true. If so, call this function + * and continue the iteration. + */ +static inline __must_check +void **radix_tree_iter_retry(struct radix_tree_iter *iter) +{ + iter->next_index = iter->index; + return NULL; +} + /** * radix_tree_chunk_size - get current chunk size * diff --git a/lib/radix-tree.c b/lib/radix-tree.c index fcf5d98574ce..6b79e9026e24 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1019,9 +1019,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { - results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; + if (radix_tree_is_indirect_ptr(results[ret])) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (++ret == max_items) break; } @@ -1098,9 +1102,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { - results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; + if (radix_tree_is_indirect_ptr(results[ret])) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (++ret == max_items) break; } -- cgit From dc6ae6d8e7726bad4f1c87244b49cac851746c65 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 31 Jan 2016 14:36:07 +0100 Subject: crush: add chooseleaf_stable tunable Add a tunable to fix the bug that chooseleaf may cause unnecessary pg migrations when some device fails. Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 8 +++++++- net/ceph/crush/mapper.c | 18 ++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 48b49305716b..be8f12b8f195 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -59,7 +59,8 @@ enum { CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, - CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 + CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, + CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 }; /* @@ -205,6 +206,11 @@ struct crush_map { * mappings line up a bit better with previous mappings. */ __u8 chooseleaf_vary_r; + /* if true, it makes chooseleaf firstn to return stable results (if + * no local retry) so that data migrations would be optimal when some + * device fails. */ + __u8 chooseleaf_stable; + #ifndef __KERNEL__ /* * version 0 (original) of straw_calc has various flaws. version 1 diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index abb700621e4a..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent @@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_fallback_retries, int recurse_to_leaf, unsigned int vary_r, + unsigned int stable, int *out2, int parent_r) { @@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, int collide, reject; int count = out_size; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep, tries, recurse_tries, local_retries, local_fallback_retries, - parent_r); + parent_r, stable); - for (rep = outpos; rep < numrep && count > 0 ; rep++) { + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, - x, outpos+1, 0, + x, stable ? 1 : outpos+1, 0, out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, 0, vary_r, + stable, NULL, sub_r) <= outpos) /* didn't get leaf */ @@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, int choose_local_fallback_retries = map->choose_local_fallback_tries; int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, vary_r = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries, recurse_to_leaf, vary_r, + stable, c+osize, 0); } else { -- cgit From 97db9a88186e3a7d3a1942370c836bf221d3ab90 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 1 Feb 2016 17:08:33 +0100 Subject: libceph: advertise support for TUNABLES5 Add TUNABLES5 feature (chooseleaf_stable tunable) to a set of features supported by default. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index f89b31d45cc8..c3b211c9fe83 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -63,6 +63,16 @@ #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ +#define CEPH_FEATURE_MON_METADATA (1ULL<<50) +#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ +#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) +#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) +#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) +#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) +#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */ +#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ +#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ +#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -108,7 +118,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ - CEPH_FEATURE_CRUSH_V4) + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_CRUSH_TUNABLES5) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit From b0b31a8ffe54abf0a455bcaee54dd92f08817164 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 3 Feb 2016 15:25:48 +0100 Subject: libceph: MOSDOpReply v7 encoding Empty request_redirect_t (struct ceph_request_redirect in the kernel client) is now encoded with a bool. NEW_OSDOPREPLY_ENCODING feature bit overlaps with already supported CRUSH_TUNABLES5. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 5 ++++- net/ceph/osd_client.c | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index c3b211c9fe83..c1ef6f14e7be 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -73,6 +73,8 @@ #define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ #define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ +// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 +#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -119,7 +121,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ CEPH_FEATURE_CRUSH_V4 | \ - CEPH_FEATURE_CRUSH_TUNABLES5) + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f235930d88..3534e12683d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 osdmap_epoch; int already_completed; u32 bytes; + u8 decode_redir; unsigned int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) p += 8 + 4; /* skip replay_version */ p += 8; /* skip user_version */ + if (le16_to_cpu(msg->hdr.version) >= 7) + ceph_decode_8_safe(&p, end, decode_redir, bad_put); + else + decode_redir = 1; + } else { + decode_redir = 0; + } + + if (decode_redir) { err = ceph_redirect_decode(&p, end, &redir); if (err) goto bad_put; -- cgit From 080fe2068e1c7f19f565b30b78baf78edf16a980 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Feb 2016 15:36:41 -0800 Subject: mm, hugetlb: don't require CMA for runtime gigantic pages Commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") has added the runtime gigantic page allocation via alloc_contig_range(), making this support available only when CONFIG_CMA is enabled. Because it doesn't depend on MIGRATE_CMA pageblocks and the associated infrastructure, it is possible with few simple adjustments to require only CONFIG_MEMORY_ISOLATION instead of full CONFIG_CMA. After this patch, alloc_contig_range() and related functions are available and used for gigantic pages with just CONFIG_MEMORY_ISOLATION enabled. Note CONFIG_CMA selects CONFIG_MEMORY_ISOLATION. This allows supporting runtime gigantic pages without the CMA-specific checks in page allocator fastpaths. Signed-off-by: Vlastimil Babka Cc: Luiz Capitulino Cc: Kirill A. Shutemov Cc: Zhang Yanfei Cc: Yasuaki Ishimatsu Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 4 ++-- include/linux/gfp.h | 6 +++--- mm/hugetlb.c | 2 +- mm/page_alloc.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 42982b26e32b..740d7ac03a55 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static __init int gigantic_pages_init(void) { - /* With CMA we can allocate gigantic pages at runtime */ + /* With compaction or CMA we can allocate gigantic pages at runtime */ if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 28ad5f6494b0..af1f2b24bbe4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#ifdef CONFIG_CMA - +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); +#endif +#ifdef CONFIG_CMA /* CMA stuff */ extern void init_cma_reserved_pageblock(struct page *page); - #endif #endif /* __LINUX_GFP_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7a802427ea8..06ae13e869d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea2c4d3e0c03..838ca8bb64f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page) return !has_unmovable_pages(zone, page, 0, true); } -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) { -- cgit From 12352d3cae2cebe18805a91fab34b534d7444231 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:36:50 -0800 Subject: mm: replace vma_lock_anon_vma with anon_vma_lock_read/write Sequence vma_lock_anon_vma() - vma_unlock_anon_vma() isn't safe if anon_vma appeared between lock and unlock. We have to check anon_vma first or call anon_vma_prepare() to be sure that it's here. There are only few users of these legacy helpers. Let's get rid of them. This patch fixes anon_vma lock imbalance in validate_mm(). Write lock isn't required here, read lock is enough. And reorders expand_downwards/expand_upwards: security_mmap_addr() and wrapping-around check don't have to be under anon vma lock. Link: https://lkml.kernel.org/r/CACT4Y+Y908EjM2z=706dv4rV6dWtxTLK9nFg9_7DhRMLppBo2g@mail.gmail.com Signed-off-by: Konstantin Khlebnikov Reported-by: Dmitry Vyukov Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 14 ------------- mm/mmap.c | 55 ++++++++++++++++++++++++---------------------------- 2 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bdf597c4f0be..a07f42bedda3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma) __put_anon_vma(anon_vma); } -static inline void vma_lock_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) - down_write(&anon_vma->root->rwsem); -} - -static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = vma->anon_vma; - if (anon_vma) - up_write(&anon_vma->root->rwsem); -} - static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); diff --git a/mm/mmap.c b/mm/mmap.c index 918c9ec5043f..2f2415a7a688 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -459,12 +459,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -2145,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - int error; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2188,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2211,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2227,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2263,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2284,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma, } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; -- cgit From 732042821cfa106b3c20b9780e4c60fee9d68900 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 5 Feb 2016 15:37:01 -0800 Subject: radix-tree: fix oops after radix_tree_iter_retry Helper radix_tree_iter_retry() resets next_index to the current index. In following radix_tree_next_slot current chunk size becomes zero. This isn't checked and it tries to dereference null pointer in slot. Tagged iterator is fine because retry happens only at slot 0 where tag bitmask in iter->tags is filled with single bit. Fixes: 46437f9a554f ("radix-tree: fix race in gang lookup") Signed-off-by: Konstantin Khlebnikov Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Ohad Ben-Cohen Cc: Jeremiah Mahler Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 00b17c526c1f..f54be7082207 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -400,7 +400,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) * @iter: pointer to radix tree iterator * Returns: current chunk size */ -static __always_inline unsigned +static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; @@ -434,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return slot + offset + 1; } } else { - unsigned size = radix_tree_chunk_size(iter) - 1; + long size = radix_tree_chunk_size(iter); - while (size--) { + while (--size > 0) { slot++; iter->index++; if (likely(*slot)) -- cgit