From ac8dec420970f5cbaf2f6eda39153a60ec5b257b Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Mon, 18 Nov 2019 15:19:35 -0800
Subject: locking/percpu-rwsem: Fold __percpu_up_read()

Now that __percpu_up_read() is only ever used from percpu_up_read()
merge them, it's a small function.

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20200131151540.212415454@infradead.org
---
 kernel/exit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..f64a8f9d412a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 		wake_up_process(task);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
 
 /*
  * Determine if a process group is "orphaned", according to the POSIX
-- 
cgit 


From 7bc3e6e55acf065500a24621f3b313e7e5998acf Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 19 Feb 2020 18:22:26 -0600
Subject: proc: Use a list of inodes to flush from proc

Rework the flushing of proc to use a list of directory inodes that
need to be flushed.

The list is kept on struct pid not on struct task_struct, as there is
a fixed connection between proc inodes and pids but at least for the
case of de_thread the pid of a task_struct changes.

This removes the dependency on proc_mnt which allows for different
mounts of proc having different mount options even in the same pid
namespace and this allows for the removal of proc_mnt which will
trivially the first mount of proc to honor it's mount options.

This flushing remains an optimization.  The functions
pid_delete_dentry and pid_revalidate ensure that ordinary dcache
management will not attempt to use dentries past the point their
respective task has died.  When unused the shrinker will
eventually be able to remove these dentries.

There is a case in de_thread where proc_flush_pid can be
called early for a given pid.  Which winds up being
safe (if suboptimal) as this is just an optiimization.

Only pid directories are put on the list as the other
per pid files are children of those directories and
d_invalidate on the directory will get them as well.

So that the pid can be used during flushing it's reference count is
taken in release_task and dropped in proc_flush_pid.  Further the call
of proc_flush_pid is moved after the tasklist_lock is released in
release_task so that it is certain that the pid has already been
unhashed when flushing it taking place.  This removes a small race
where a dentry could recreated.

As struct pid is supposed to be small and I need a per pid lock
I reuse the only lock that currently exists in struct pid the
the wait_pidfd.lock.

The net result is that this adds all of this functionality
with just a little extra list management overhead and
a single extra pointer in struct pid.

v2: Initialize pid->inodes.  I somehow failed to get that
    initialization into the initial version of the patch.  A boot
    failure was reported by "kernel test robot <lkp@intel.com>", and
    failure to initialize that pid->inodes matches all of the reported
    symptoms.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c          | 111 ++++++++++++++++--------------------------------
 fs/proc/inode.c         |   2 +-
 fs/proc/internal.h      |   1 +
 include/linux/pid.h     |   1 +
 include/linux/proc_fs.h |   4 +-
 kernel/exit.c           |   4 +-
 kernel/pid.c            |   1 +
 7 files changed, 45 insertions(+), 79 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7c64272b0fa..e7efe9d6f3d6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	*rgid = gid;
 }
 
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+	struct pid *pid = ei->pid;
+
+	if (S_ISDIR(ei->vfs_inode.i_mode)) {
+		spin_lock(&pid->wait_pidfd.lock);
+		hlist_del_init_rcu(&ei->sibling_inodes);
+		spin_unlock(&pid->wait_pidfd.lock);
+	}
+
+	put_pid(pid);
+}
+
 struct inode *proc_pid_make_inode(struct super_block * sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
+	struct pid *pid;
 
 	/* We need a new inode */
 
@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	/*
 	 * grab the reference to task.
 	 */
-	ei->pid = get_task_pid(task, PIDTYPE_PID);
-	if (!ei->pid)
+	pid = get_task_pid(task, PIDTYPE_PID);
+	if (!pid)
 		goto out_unlock;
 
+	/* Let the pid remember us for quick removal */
+	ei->pid = pid;
+	if (S_ISDIR(mode)) {
+		spin_lock(&pid->wait_pidfd.lock);
+		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+		spin_unlock(&pid->wait_pidfd.lock);
+	}
+
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
 
@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.permission	= proc_pid_permission,
 };
 
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
-	struct dentry *dentry, *leader, *dir;
-	char buf[10 + 1];
-	struct qstr name;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", pid);
-	/* no ->d_hash() rejects on procfs */
-	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
-	if (dentry) {
-		d_invalidate(dentry);
-		dput(dentry);
-	}
-
-	if (pid == tgid)
-		return;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
-	leader = d_hash_and_lookup(mnt->mnt_root, &name);
-	if (!leader)
-		goto out;
-
-	name.name = "task";
-	name.len = strlen(name.name);
-	dir = d_hash_and_lookup(leader, &name);
-	if (!dir)
-		goto out_put_leader;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", pid);
-	dentry = d_hash_and_lookup(dir, &name);
-	if (dentry) {
-		d_invalidate(dentry);
-		dput(dentry);
-	}
-
-	dput(dir);
-out_put_leader:
-	dput(leader);
-out:
-	return;
-}
-
 /**
- * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
+ * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
  *
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
- *
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
  *
  * It is safe and reasonable to cache /proc entries for a task until
  * that task exits.  After that they just clog up the dcache with
  * useless entries, possibly causing useful dcache entries to be
- * flushed instead.  This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead.  This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
  *
  * NOTE: This routine is just an optimization so it does not guarantee
- *       that no dcache entries will exist at process exit time it
- *       just makes it very unlikely that any will persist.
+ *       that no dcache entries will exist after a process is reaped
+ *       it just makes it very unlikely that any will persist.
  */
 
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
 {
-	int i;
-	struct pid *pid, *tgid;
-	struct upid *upid;
-
-	pid = task_pid(task);
-	tgid = task_tgid(task);
-
-	for (i = 0; i <= pid->level; i++) {
-		upid = &pid->numbers[i];
-		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-					tgid->numbers[i].nr);
-	}
+	proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+	put_pid(pid);
 }
 
 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d9243b24554a..1e730ea1dcd6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
 
 	/* Stop tracking associated processes */
 	if (ei->pid) {
-		put_pid(ei->pid);
+		proc_pid_evict_inode(ei);
 		ei->pid = NULL;
 	}
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index fd470172675f..9e294f0290e5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
 extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 998ae7d24450..01a0d4e28506 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -62,6 +62,7 @@ struct pid
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
+	struct hlist_head inodes;
 	/* wait queue for pidfd notifications */
 	wait_queue_head_t wait_pidfd;
 	struct rcu_head rcu;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3dfa92633af3..40a7982b7285 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -32,7 +32,7 @@ struct proc_ops {
 typedef int (*proc_write_t)(struct file *, char *, size_t);
 
 extern void proc_root_init(void);
-extern void proc_flush_task(struct task_struct *);
+extern void proc_flush_pid(struct pid *);
 
 extern struct proc_dir_entry *proc_symlink(const char *,
 		struct proc_dir_entry *, const char *);
@@ -105,7 +105,7 @@ static inline void proc_root_init(void)
 {
 }
 
-static inline void proc_flush_task(struct task_struct *task)
+static inline void proc_flush_pid(struct pid *pid)
 {
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..502b4995b688 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -191,6 +191,7 @@ void put_task_struct_rcu_user(struct task_struct *task)
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
+	struct pid *thread_pid;
 	int zap_leader;
 repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
@@ -199,11 +200,11 @@ repeat:
 	atomic_dec(&__task_cred(p)->user->processes);
 	rcu_read_unlock();
 
-	proc_flush_task(p);
 	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
+	thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
 
 	/*
@@ -226,6 +227,7 @@ repeat:
 	}
 
 	write_unlock_irq(&tasklist_lock);
+	proc_flush_pid(thread_pid);
 	release_thread(p);
 	put_task_struct_rcu_user(p);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 0f4ecb57214c..ca08d6a3aa77 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -258,6 +258,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
 	init_waitqueue_head(&pid->wait_pidfd);
+	INIT_HLIST_HEAD(&pid->inodes);
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-- 
cgit 


From b95e31c07c5eb4f5c0769f12b38b0343d7961040 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 28 Feb 2020 11:15:03 -0600
Subject: posix-cpu-timers: Stop disabling timers on mt-exec

The reasons why the extra posix_cpu_timers_exit_group() invocation has been
added are not entirely clear from the commit message.  Today all that
posix_cpu_timers_exit_group() does is stop timers that are tracking the
task from firing.  Every other operation on those timers is still allowed.

The practical implication of this is posix_cpu_timer_del() which could
not get the siglock after the thread group leader has exited (because
sighand == NULL) would be able to run successfully because the timer
was already dequeued.

With that locking issue fixed there is no point in disabling all of the
timers.  So remove this ``tempoary'' hack.

Fixes: e0a70217107e ("posix-cpu-timers: workaround to suppress the problems with mt exec")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/87o8tityzs.fsf@x220.int.ebiederm.org
---
 kernel/exit.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..df546315f699 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -103,17 +103,8 @@ static void __exit_signal(struct task_struct *tsk)
 
 #ifdef CONFIG_POSIX_TIMERS
 	posix_cpu_timers_exit(tsk);
-	if (group_dead) {
+	if (group_dead)
 		posix_cpu_timers_exit_group(tsk);
-	} else {
-		/*
-		 * This can only happen if the caller is de_thread().
-		 * FIXME: this is the temporary hack, we should teach
-		 * posix-cpu-timers to handle this case correctly.
-		 */
-		if (unlikely(has_group_leader_pid(tsk)))
-			posix_cpu_timers_exit_group(tsk);
-	}
 #endif
 
 	if (group_dead) {
-- 
cgit