From 490dea45d00f01847ebebd007685d564aaf2cd98 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 24 Nov 2008 17:06:57 +0100
Subject: itimers: remove the per-cpu-ish-ness

Either we bounce once cacheline per cpu per tick, yielding n^2 bounces
or we just bounce a single..

Also, using per-cpu allocations for the thread-groups complicates the
per-cpu allocator in that its currently aimed to be a fixed sized
allocator and the only possible extention to that would be vmap based,
which is seriously constrained on 32 bit archs.

So making the per-cpu memory requirement depend on the number of
processes is an issue.

Lastly, it didn't deal with cpu-hotplug, although admittedly that might
be fixable.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..c20943eabb4c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -450,6 +450,7 @@ struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
+	spinlock_t lock;
 };
 /* Alternate field names when used to cache expirations. */
 #define prof_exp	stime
@@ -465,7 +466,7 @@ struct task_cputime {
  * used for thread group CPU clock calculations.
  */
 struct thread_group_cputime {
-	struct task_cputime *totals;
+	struct task_cputime totals;
 };
 
 /*
@@ -2180,24 +2181,30 @@ static inline int spin_needbreak(spinlock_t *lock)
  * Thread group CPU time accounting.
  */
 
-extern int thread_group_cputime_alloc(struct task_struct *);
-extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
-
-static inline void thread_group_cputime_init(struct signal_struct *sig)
+static inline
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
-	sig->cputime.totals = NULL;
+	struct task_cputime *totals = &tsk->signal->cputime.totals;
+	unsigned long flags;
+
+	spin_lock_irqsave(&totals->lock, flags);
+	*times = *totals;
+	spin_unlock_irqrestore(&totals->lock, flags);
 }
 
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
+static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
-	if (curr->signal->cputime.totals)
-		return 0;
-	return thread_group_cputime_alloc(curr);
+	sig->cputime.totals = (struct task_cputime){
+		.utime = cputime_zero,
+		.stime = cputime_zero,
+		.sum_exec_runtime = 0,
+	};
+
+	spin_lock_init(&sig->cputime.totals.lock);
 }
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
-	free_percpu(sig->cputime.totals);
 }
 
 /*
-- 
cgit 


From baf48f6577e581a9adb8fe849dc80e24b21d171d Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Mon, 12 Jan 2009 21:15:17 -0800
Subject: softlock: fix false panic which can occur if softlockup_thresh is
 reduced

At run-time, if softlockup_thresh is changed to a much lower value,
touch_timestamp is likely to be much older than the new softlock_thresh.

This will cause a false softlockup to be detected. If softlockup_panic
is enabled, the system will panic.

The fix is to touch all watchdogs before changing softlockup_thresh.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 3 +++
 kernel/softlockup.c   | 9 +++++++++
 kernel/sysctl.c       | 2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..54cbabf3b871 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -293,6 +293,9 @@ extern void sched_show_task(struct task_struct *p);
 extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
+extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+				    struct file *filp, void __user *buffer,
+				    size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278a..85d5a2455103 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
 #include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 
 #include <asm/irq_regs.h>
 
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
 }
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
 
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+			     struct file *filp, void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	touch_all_softlockup_watchdogs();
+	return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+}
+
 /*
  * This callback runs from the timer interrupt, and checks
  * whether the watchdog thread has hung or not:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 89d74436318c..596dc31a7116 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -800,7 +800,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &softlockup_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_dosoftlockup_thresh,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &neg_one,
 		.extra2		= &sixty,
-- 
cgit 


From 9df04e1f25effde823a600e755b51475d438f56b Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Thu, 29 Jan 2009 14:25:26 -0800
Subject: epoll: drop max_user_instances and rely only on max_user_watches

Linus suggested to put limits where the money is, and max_user_watches
already does that w/out the need of max_user_instances.  That has the
advantage to mitigate the potential DoS while allowing pretty generous
default behavior.

Allowing top 4% of low memory (per user) to be allocated in epoll watches,
we have:

LOMEM    MAX_WATCHES (per user)
512MB    ~178000
1GB      ~356000
2GB      ~712000

A box with 512MB of lomem, will meet some challenge in hitting 180K
watches, socket buffers math teaches us.  No more max_user_instances
limits then.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Bron Gondwana <brong@fastmail.fm>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c        | 22 ++++------------------
 include/linux/sched.h |  1 -
 2 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ba2f9ec71192..011b9b8c90c6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
 /*
  * Configuration options available inside /proc/sys/fs/epoll/
  */
-/* Maximum number of epoll devices, per user */
-static int max_user_instances __read_mostly;
 /* Maximum number of epoll watched descriptors, per user */
 static int max_user_watches __read_mostly;
 
@@ -260,14 +258,6 @@ static struct kmem_cache *pwq_cache __read_mostly;
 static int zero;
 
 ctl_table epoll_table[] = {
-	{
-		.procname	= "max_user_instances",
-		.data		= &max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &zero,
-	},
 	{
 		.procname	= "max_user_watches",
 		.data		= &max_user_watches,
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
-	atomic_dec(&ep->user->epoll_devs);
 	free_uid(ep->user);
 	kfree(ep);
 }
@@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep)
 	struct eventpoll *ep;
 
 	user = get_current_user();
-	error = -EMFILE;
-	if (unlikely(atomic_read(&user->epoll_devs) >=
-			max_user_instances))
-		goto free_uid;
 	error = -ENOMEM;
 	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 	if (unlikely(!ep))
@@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
-	atomic_inc(&ep->user->epoll_devs);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void)
 	struct sysinfo si;
 
 	si_meminfo(&si);
-	max_user_instances = 128;
-	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+	/*
+	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
+	 */
+	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
 		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 02e16d207304..5a7c76388731 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -630,7 +630,6 @@ struct user_struct {
 	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
 #endif
 #ifdef CONFIG_EPOLL
-	atomic_t epoll_devs;	/* The number of epoll descriptors currently open */
 	atomic_t epoll_watches;	/* The number of file descriptors currently watched */
 #endif
 #ifdef CONFIG_POSIX_MQUEUE
-- 
cgit 


From 35626129abcd6a7547e84c817ef5b6eff7a8758b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 2 Feb 2009 16:00:29 -0800
Subject: sched: add missing kernel-doc in sched.h

Add kernel-doc notation for @lock:

include/linux/sched.h:457: No description found for parameter 'lock'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a7c76388731..2127e959e0f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -443,6 +443,7 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
+ * @lock:		lock for fields in this struct
  *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
-- 
cgit