From 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Feb 2014 19:13:49 -0800 Subject: namespaces: Use task_lock and not rcu to protect nsproxy The synchronous syncrhonize_rcu in switch_task_namespaces makes setns a sufficiently expensive system call that people have complained. Upon inspect nsproxy no longer needs rcu protection for remote reads. remote reads are rare. So optimize for same process reads and write by switching using rask_lock instead. This yields a simpler to understand lock, and a faster setns system call. In particular this fixes a performance regression observed by Rafael David Tinoco . This is effectively a revert of Pavel Emelyanov's commit cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter from 2007. The race this originialy fixed no longer exists as do_notify_parent uses task_active_pid_ns(parent) instead of parent->nsproxy. Signed-off-by: "Eric W. Biederman" --- ipc/namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'ipc') diff --git a/ipc/namespace.c b/ipc/namespace.c index 59451c1e214d..b54468e48e32 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task) struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); - rcu_read_unlock(); + task_unlock(task); return ns; } -- cgit From ab602f799159393143d567e5c04b936fec79d6bd Mon Sep 17 00:00:00 2001 From: Jack Miller Date: Fri, 8 Aug 2014 14:23:19 -0700 Subject: shm: make exit_shm work proportional to task activity This is small set of patches our team has had kicking around for a few versions internally that fixes tasks getting hung on shm_exit when there are many threads hammering it at once. Anton wrote a simple test to cause the issue: http://ozlabs.org/~anton/junkcode/bust_shm_exit.c Before applying this patchset, this test code will cause either hanging tracebacks or pthread out of memory errors. After this patchset, it will still produce output like: root@somehost:~# ./bust_shm_exit 1024 160 ... INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113) INFO: Stall ended before state dump start ... But the task will continue to run along happily, so we consider this an improvement over hanging, even if it's a bit noisy. This patch (of 3): exit_shm obtains the ipc_ns shm rwsem for write and holds it while it walks every shared memory segment in the namespace. Thus the amount of work is related to the number of shm segments in the namespace not the number of segments that might need to be cleaned. In addition, this occurs after the task has been notified the thread has exited, so the number of tasks waiting for the ns shm rwsem can grow without bound until memory is exausted. Add a list to the task struct of all shmids allocated by this task. Init the list head in copy_process. Use the ns->rwsem for locking. Add segments after id is added, remove before removing from id. On unshare of NEW_IPCNS orphan any ids as if the task had exited, similar to handling of semaphore undo. I chose a define for the init sequence since its a simple list init, otherwise it would require a function call to avoid include loops between the semaphore code and the task struct. Converting the list_del to list_del_init for the unshare cases would remove the exit followed by init, but I left it blow up if not inited. Signed-off-by: Milton Miller Signed-off-by: Jack Miller Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ include/linux/shm.h | 16 +++++++++++++++- ipc/shm.c | 22 +++++++++++----------- kernel/fork.c | 6 ++++++ 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'ipc') diff --git a/include/linux/sched.h b/include/linux/sched.h index b21e9218c0fd..db2f6474e95e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -33,6 +33,7 @@ struct sched_param { #include #include +#include #include #include #include @@ -1385,6 +1386,7 @@ struct task_struct { #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ diff --git a/include/linux/shm.h b/include/linux/shm.h index 57d77709fbe2..fd206387048a 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SHM_H_ #define _LINUX_SHM_H_ +#include #include #include #include @@ -20,6 +21,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; + struct list_head shm_clist; /* list by creator */ }; /* shm_mode upper byte flags */ @@ -44,11 +46,20 @@ struct shmid_kernel /* private to the kernel */ #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #ifdef CONFIG_SYSVIPC +struct sysv_shm { + struct list_head shm_clist; +}; + long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); extern int is_file_shm_hugepages(struct file *file); -extern void exit_shm(struct task_struct *task); +void exit_shm(struct task_struct *task); +#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else +struct sysv_shm { + /* empty */ +}; + static inline long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba) @@ -62,6 +73,9 @@ static inline int is_file_shm_hugepages(struct file *file) static inline void exit_shm(struct task_struct *task) { } +static inline void shm_init_task(struct task_struct *task) +{ +} #endif #endif /* _LINUX_SHM_H_ */ diff --git a/ipc/shm.c b/ipc/shm.c index 89fc354156cb..1fc3a61b443b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -178,6 +178,7 @@ static void shm_rcu_free(struct rcu_head *head) static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { + list_del(&s->shm_clist); ipc_rmid(&shm_ids(ns), &s->shm_perm); } @@ -268,14 +269,10 @@ static void shm_close(struct vm_area_struct *vma) } /* Called with ns->shm_ids(ns).rwsem locked */ -static int shm_try_destroy_current(int id, void *p, void *data) +static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns) { - struct ipc_namespace *ns = data; - struct kern_ipc_perm *ipcp = p; - struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); - - if (shp->shm_creator != current) - return 0; + if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */ + return; /* * Mark it as orphaned to destroy the segment when @@ -289,13 +286,12 @@ static int shm_try_destroy_current(int id, void *p, void *data) * is not set, it shouldn't be deleted here. */ if (!ns->shm_rmid_forced) - return 0; + return; if (shm_may_destroy(ns, shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } - return 0; } /* Called with ns->shm_ids(ns).rwsem locked */ @@ -333,14 +329,17 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) void exit_shm(struct task_struct *task) { struct ipc_namespace *ns = task->nsproxy->ipc_ns; + struct shmid_kernel *shp, *n; if (shm_ids(ns).in_use == 0) return; /* Destroy all already created segments, but not mapped yet */ down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) - idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); + list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) + shm_mark_orphan(shp, ns); + /* remove the list head from any segments still attached */ + list_del(&task->sysvshm.shm_clist); up_write(&shm_ids(ns).rwsem); } @@ -561,6 +560,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); /* * shmid gets reported as "inode#" in /proc/pid/maps. diff --git a/kernel/fork.c b/kernel/fork.c index 86da59e165ad..fa9124322cd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1362,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1913,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); -- cgit From 83293c0f5a6130bf7d60b7b406f4a4de0cadd3d8 Mon Sep 17 00:00:00 2001 From: Jack Miller Date: Fri, 8 Aug 2014 14:23:21 -0700 Subject: shm: allow exit_shm in parallel if only marking orphans If shm_rmid_force (the default state) is not set then the shmids are only marked as orphaned and does not require any add, delete, or locking of the tree structure. Seperate the sysctl on and off case, and only obtain the read lock. The newly added list head can be deleted under the read lock because we are only called with current and will only change the semids allocated by this task and not manipulate the list. This commit assumes that up_read includes a sufficient memory barrier for the writes to be seen my others that later obtain a write lock. Signed-off-by: Milton Miller Signed-off-by: Jack Miller Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 67 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 32 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index 1fc3a61b443b..7fc9f9f3a26b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -268,32 +268,6 @@ static void shm_close(struct vm_area_struct *vma) up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rwsem locked */ -static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns) -{ - if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */ - return; - - /* - * Mark it as orphaned to destroy the segment when - * kernel.shm_rmid_forced is changed. - * It is noop if the following shm_may_destroy() returns true. - */ - shp->shm_creator = NULL; - - /* - * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID - * is not set, it shouldn't be deleted here. - */ - if (!ns->shm_rmid_forced) - return; - - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); - } -} - /* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { @@ -325,20 +299,49 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) up_write(&shm_ids(ns).rwsem); } - +/* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { struct ipc_namespace *ns = task->nsproxy->ipc_ns; struct shmid_kernel *shp, *n; - if (shm_ids(ns).in_use == 0) + if (list_empty(&task->sysvshm.shm_clist)) + return; + + /* + * If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) { + down_read(&shm_ids(ns).rwsem); + list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) + shp->shm_creator = NULL; + /* + * Only under read lock but we are only called on current + * so no entry on the list will be shared. + */ + list_del(&task->sysvshm.shm_clist); + up_read(&shm_ids(ns).rwsem); return; + } - /* Destroy all already created segments, but not mapped yet */ + /* + * Destroy all already created segments, that were not yet mapped, + * and mark any mapped as orphan to cover the sysctl toggling. + * Destroy is skipped if shm_may_destroy() returns false. + */ down_write(&shm_ids(ns).rwsem); - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) - shm_mark_orphan(shp, ns); - /* remove the list head from any segments still attached */ + list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { + shp->shm_creator = NULL; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + } + + /* Remove the list head from any segments still attached. */ list_del(&task->sysvshm.shm_clist); up_write(&shm_ids(ns).rwsem); } -- cgit