From 3854a771821c970065e3203a0b40ddc4101538cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:29 -0700 Subject: __exit_signal: don't take rcu lock There is no reason for rcu_read_lock() in __exit_signal(). tsk->sighand can only be changed if tsk does exec, obviously this is not possible. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 93d2711b9381..a7799d8a6404 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -136,7 +135,6 @@ static void __exit_signal(struct task_struct *tsk) tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); -- cgit From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:37 -0700 Subject: introduce PF_KTHREAD flag Introduce the new PF_KTHREAD flag to mark the kernel threads. It is set by INIT_TASK() and copied to the forked childs (we could set it in kthreadd() along with PF_NOFREEZE instead). daemonize() was changed as well. In that case testing of PF_KTHREAD is racy, but daemonize() is hopeless anyway. This flag is cleared in do_execve(), before search_binary_handler(). Probably not the best place, we can do this in exec_mmap() or in start_thread(), or clear it along with PF_FORKNOEXEC. But I think this doesn't matter in practice, and if do_execve() fails kthread should die soon. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/init_task.h | 2 +- include/linux/sched.h | 1 + kernel/exit.c | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index af249af4ccab..cd2e8c9b1249 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1326,6 +1326,7 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 93c45acf249a..021d8e720c79 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -122,7 +122,7 @@ extern struct group_info init_groups; .state = 0, \ .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ - .flags = 0, \ + .flags = PF_KTHREAD, \ .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 79e749dbf81e..eec64a4adb9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1483,6 +1483,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_KTHREAD 0x00000020 /* I am a kernel thread */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/kernel/exit.c b/kernel/exit.c index a7799d8a6404..28a44a2612dc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,7 +430,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); -- cgit From 32ecb1f26dd50eeaac4e3f4dea4541c97848e459 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: turn mm->core_startup_done into the pointer to struct core_state mm->core_startup_done points to "struct completion startup_done" allocated on the coredump_wait()'s stack. Introduce the new structure, core_state, which holds this "struct completion". This way we can add more info visible to the threads participating in coredump without enlarging mm_struct. No changes in affected .o files. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 ++++---- include/linux/mm_types.h | 7 ++++++- kernel/exit.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index e347e6ed1617..71734568f018 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1597,13 +1597,13 @@ static int coredump_wait(int exit_code) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion startup_done; + struct core_state core_state; struct completion *vfork_done; int core_waiters; init_completion(&mm->core_done); - init_completion(&startup_done); - mm->core_startup_done = &startup_done; + init_completion(&core_state.startup); + mm->core_state = &core_state; core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); @@ -1622,7 +1622,7 @@ static int coredump_wait(int exit_code) } if (core_waiters) - wait_for_completion(&startup_done); + wait_for_completion(&core_state.startup); fail: BUG_ON(mm->core_waiters); return core_waiters; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 02a27ae78539..97819efd2333 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,6 +159,10 @@ struct vm_area_struct { #endif }; +struct core_state { + struct completion startup; +}; + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -220,7 +224,8 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ /* coredumping support */ - struct completion *core_startup_done, core_done; + struct core_state *core_state; + struct completion core_done; /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ diff --git a/kernel/exit.c b/kernel/exit.c index 28a44a2612dc..f7fa21dbced4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -680,7 +680,7 @@ static void exit_mm(struct task_struct * tsk) up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); if (!--mm->core_waiters) - complete(mm->core_startup_done); + complete(&mm->core_state->startup); up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); -- cgit From 999d9fc1670bc082928b93b11d1f2e0e417d973c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:41 -0700 Subject: coredump: move mm->core_waiters into struct core_state Move mm->core_waiters into "struct core_state" allocated on stack. This shrinks mm_struct a little bit and allows further changes. This patch mostly does s/core_waiters/core_state. The only essential change is that coredump_wait() must clear mm->core_state before return. The coredump_wait()'s path is uglified and .text grows by 30 bytes, this is fixed by the next patch. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 21 +++++++++++---------- include/linux/mm_types.h | 2 +- kernel/exit.c | 8 ++++---- kernel/fork.c | 2 +- kernel/signal.c | 4 ++-- 5 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 71734568f018..50de3aaff4d0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -722,12 +722,10 @@ static int exec_mmap(struct mm_struct *mm) * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_sem around - * checking core_waiters and changing tsk->mm. The - * core-inducing thread will increment core_waiters for - * each thread whose ->mm == old_mm. + * checking core_state and changing tsk->mm. */ down_read(&old_mm->mmap_sem); - if (unlikely(old_mm->core_waiters)) { + if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); return -EINTR; } @@ -1514,7 +1512,7 @@ static void zap_process(struct task_struct *start) t = start; do { if (t != current && t->mm) { - t->mm->core_waiters++; + t->mm->core_state->nr_threads++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } @@ -1538,11 +1536,11 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (err) return err; - if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + if (atomic_read(&mm->mm_users) == mm->core_state->nr_threads + 1) goto done; /* * We should find and kill all tasks which use this mm, and we should - * count them correctly into mm->core_waiters. We don't take tasklist + * count them correctly into ->nr_threads. We don't take tasklist * lock, but this is safe wrt: * * fork: @@ -1590,7 +1588,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - return mm->core_waiters; + return mm->core_state->nr_threads; } static int coredump_wait(int exit_code) @@ -1603,9 +1601,12 @@ static int coredump_wait(int exit_code) init_completion(&mm->core_done); init_completion(&core_state.startup); + core_state.nr_threads = 0; mm->core_state = &core_state; core_waiters = zap_threads(tsk, mm, exit_code); + if (core_waiters < 0) + mm->core_state = NULL; up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1623,8 +1624,8 @@ static int coredump_wait(int exit_code) if (core_waiters) wait_for_completion(&core_state.startup); + mm->core_state = NULL; fail: - BUG_ON(mm->core_waiters); return core_waiters; } @@ -1702,7 +1703,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_waiters || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); goto fail; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 97819efd2333..c0b1747b61a5 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -160,6 +160,7 @@ struct vm_area_struct { }; struct core_state { + int nr_threads; struct completion startup; }; @@ -179,7 +180,6 @@ struct mm_struct { atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ int map_count; /* number of VMAs */ - int core_waiters; struct rw_semaphore mmap_sem; spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/exit.c b/kernel/exit.c index f7fa21dbced4..988e232254e9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -670,16 +670,16 @@ static void exit_mm(struct task_struct * tsk) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + if (mm->core_state) { up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); - if (!--mm->core_waiters) + if (!--mm->core_state->nr_threads) complete(&mm->core_state->startup); up_write(&mm->mmap_sem); diff --git a/kernel/fork.c b/kernel/fork.c index eeaec6893b0d..813d5c89b9d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : MMF_DUMP_FILTER_DEFAULT; - mm->core_waiters = 0; + mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 39c1706edf03..5c7b7eaa0dc6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1480,10 +1480,10 @@ static inline int may_ptrace_stop(void) * is a deadlock situation, and pointless because our tracer * is dead so don't allow us to stop. * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_waiters != 0. Otherwise it + * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). */ - if (unlikely(current->mm->core_waiters) && + if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) return 0; -- cgit From c5f1cc8c1828486a61ab3e575da6e2c62b34d399 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:42 -0700 Subject: coredump: turn core_state->nr_threads into atomic_t Turn core_state->nr_threads into atomic_t and kill now unneeded down_write(&mm->mmap_sem) in exit_mm(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm_types.h | 2 +- kernel/exit.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index c74bb34eeeff..15d493fe8aa3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1591,7 +1591,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, } rcu_read_unlock(); done: - core_state->nr_threads = nr; + atomic_set(&core_state->nr_threads, nr); return nr; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c0b1747b61a5..ae99a28ba6ae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -160,7 +160,7 @@ struct vm_area_struct { }; struct core_state { - int nr_threads; + atomic_t nr_threads; struct completion startup; }; diff --git a/kernel/exit.c b/kernel/exit.c index 988e232254e9..63d82957baae 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -678,10 +678,9 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); if (mm->core_state) { up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_state->nr_threads) + + if (atomic_dec_and_test(&mm->core_state->nr_threads)) complete(&mm->core_state->startup); - up_write(&mm->mmap_sem); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit From b564daf806d492dd4f7afe9b6c83b8d35d137669 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:44 -0700 Subject: coredump: construct the list of coredumping threads at startup time binfmt->core_dump() has to iterate over the all threads in system in order to find the coredumping threads and construct the list using the GFP_ATOMIC allocations. With this patch each thread allocates the list node on exit_mm()'s stack and adds itself to the list. This allows us to do further changes: - simplify ->core_dump() - change exit_mm() to clear ->mm first, then wait for ->core_done. this makes the coredumping process visible to oom_kill - kill mm->core_done Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ include/linux/mm_types.h | 6 ++++++ kernel/exit.c | 15 ++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index b8ee842d93cd..fe2873b8037f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1604,6 +1604,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) init_completion(&mm->core_done); init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ae99a28ba6ae..4d0d0abc79fe 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -159,8 +159,14 @@ struct vm_area_struct { #endif }; +struct core_thread { + struct task_struct *task; + struct core_thread *next; +}; + struct core_state { atomic_t nr_threads; + struct core_thread dumper; struct completion startup; }; diff --git a/kernel/exit.c b/kernel/exit.c index 63d82957baae..b66f0d55c791 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -664,6 +664,7 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) @@ -676,11 +677,19 @@ static void exit_mm(struct task_struct * tsk) * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_state) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - if (atomic_dec_and_test(&mm->core_state->nr_threads)) - complete(&mm->core_state->startup); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); wait_for_completion(&mm->core_done); down_read(&mm->mmap_sem); -- cgit From a94e2d408eaedbd85aae259621d46fafc10479a2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:46 -0700 Subject: coredump: kill mm->core_done Now that we have core_state->dumper list we can use it to wake up the sub-threads waiting for the coredump completion. This uglifies the code and .text grows by 47 bytes, but otoh mm_struct lessens by sizeof(struct completion). Also, with this change we can decouple exit_mm() from the coredumping code. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 25 ++++++++++++++++++++++--- include/linux/mm_types.h | 4 +--- kernel/exit.c | 8 +++++++- 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index fe2873b8037f..bff43aeb235e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1602,7 +1602,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) struct completion *vfork_done; int core_waiters; - init_completion(&mm->core_done); init_completion(&core_state->startup); core_state->dumper.task = tsk; core_state->dumper.next = NULL; @@ -1628,6 +1627,27 @@ fail: return core_waiters; } +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + /* * set_dumpable converts traditional three-value dumpable to two flags and * stores them into mm->flags. It modifies lower two bits of mm->flags, but @@ -1812,8 +1832,7 @@ fail_unlock: argv_free(helper_argv); current->fsuid = fsuid; - complete_all(&mm->core_done); - mm->core_state = NULL; + coredump_finish(mm); fail: return retval; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d0d0abc79fe..746f975b58ef 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -229,9 +229,7 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access the bits */ - /* coredumping support */ - struct core_state *core_state; - struct completion core_done; + struct core_state *core_state; /* coredumping support */ /* aio bits */ rwlock_t ioctx_list_lock; /* aio lock */ diff --git a/kernel/exit.c b/kernel/exit.c index b66f0d55c791..8a4d4d12e294 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -691,7 +691,13 @@ static void exit_mm(struct task_struct * tsk) if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); - wait_for_completion(&mm->core_done); + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); -- cgit From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 25 Jul 2008 01:48:49 -0700 Subject: task IO accounting: provide distinct tgid/tid I/O statistics Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate parent I/O statistics in /proc/pid/io. This approach follows the same model used to account per-process and per-thread CPU times. As a practial application, this allows for example to quickly find the top I/O consumer when a process spawns many child threads that perform the actual I/O work, because the aggregated I/O statistics can always be found in /proc/pid/io. [ Oleg Nesterov points out that we should check that the task is still alive before we iterate over the threads, but also says that we can do that fixup on top of this later. - Linus ] Acked-by: Balbir Singh Signed-off-by: Andrea Righi Cc: Matt Heaton Cc: Shailabh Nagar Acked-by-with-comments: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 86 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/sched.h | 4 +++ kernel/exit.c | 27 ++++++++++++++++ kernel/fork.c | 6 ++++ 4 files changed, 108 insertions(+), 15 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 58c3e6a8e15e..a891fe4cb43b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2376,29 +2376,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, } #ifdef CONFIG_TASK_IO_ACCOUNTING -static int proc_pid_io_accounting(struct task_struct *task, char *buffer) -{ +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; + + if (!whole) { + rchar = task->rchar; + wchar = task->wchar; + syscr = task->syscr; + syscw = task->syscw; + memcpy(&ioac, &task->ioac, sizeof(ioac)); + } else { + unsigned long flags; + struct task_struct *t = task; + rchar = wchar = syscr = syscw = 0; + memset(&ioac, 0, sizeof(ioac)); + + rcu_read_lock(); + do { + rchar += t->rchar; + wchar += t->wchar; + syscr += t->syscr; + syscw += t->syscw; + + ioac.read_bytes += t->ioac.read_bytes; + ioac.write_bytes += t->ioac.write_bytes; + ioac.cancelled_write_bytes += + t->ioac.cancelled_write_bytes; + t = next_thread(t); + } while (t != task); + rcu_read_unlock(); + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + rchar += sig->rchar; + wchar += sig->wchar; + syscr += sig->syscr; + syscw += sig->syscw; + + ioac.read_bytes += sig->ioac.read_bytes; + ioac.write_bytes += sig->ioac.write_bytes; + ioac.cancelled_write_bytes += + sig->ioac.cancelled_write_bytes; + + unlock_task_sighand(task, &flags); + } + } + return sprintf(buffer, -#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" -#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", -#ifdef CONFIG_TASK_XACCT - (unsigned long long)task->rchar, - (unsigned long long)task->wchar, - (unsigned long long)task->syscr, - (unsigned long long)task->syscw, -#endif - (unsigned long long)task->ioac.read_bytes, - (unsigned long long)task->ioac.write_bytes, - (unsigned long long)task->ioac.cancelled_write_bytes); + (unsigned long long)rchar, + (unsigned long long)wchar, + (unsigned long long)syscr, + (unsigned long long)syscw, + (unsigned long long)ioac.read_bytes, + (unsigned long long)ioac.write_bytes, + (unsigned long long)ioac.cancelled_write_bytes); +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); } -#endif + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ /* * Thread groups @@ -2470,7 +2523,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, pid_io_accounting), + INF("io", S_IRUGO, tgid_io_accounting), #endif }; @@ -2797,6 +2850,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, tid_io_accounting), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/include/linux/sched.h b/include/linux/sched.h index af780f299c7c..d22ffe06d0eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -506,6 +506,10 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; +#ifdef CONFIG_TASK_XACCT + u64 rchar, wchar, syscr, syscw; +#endif + struct task_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the diff --git a/kernel/exit.c b/kernel/exit.c index 8a4d4d12e294..ad933bb29ec7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 813d5c89b9d5..b99d73e971a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +#ifdef CONFIG_TASK_XACCT + sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + memset(&sig->ioac, 0, sizeof(sig->ioac)); +#endif sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); -- cgit From 30199f5a46aee204bf437a4f5b0740f3efe448b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:46 -0700 Subject: tracehook: exit This moves the PTRACE_EVENT_EXIT tracing into a tracehook.h inline, tracehook_report_exec(). The change has no effect, just clean-up. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 15 +++++++++++++++ kernel/exit.c | 6 ++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6276353709c1..967ab473afbc 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -95,4 +95,19 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt, send_sig(SIGTRAP, current, 0); } +/** + * tracehook_report_exit - task has begun to exit + * @exit_code: pointer to value destined for @current->exit_code + * + * @exit_code points to the value passed to do_exit(), which tracing + * might change here. This is almost the first thing in do_exit(), + * before freeing any resources or setting the %PF_EXITING flag. + * + * Called with no locks held. + */ +static inline void tracehook_report_exit(long *exit_code) +{ + ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code); +} + #endif /* */ diff --git a/kernel/exit.c b/kernel/exit.c index ad933bb29ec7..c3691cbc220a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -1029,10 +1030,7 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just -- cgit From dae33574dcf5211e1f43c7e45fa29f73ba3e00cb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:48 -0700 Subject: tracehook: release_task This moves the ptrace-related logic from release_task into tracehook.h and ptrace.h inlines. It provides clean hooks both before and after locking tasklist_lock, for future tracing logic to do more cleanup without the lock. This also changes release_task() itself in the rare "zap_leader" case to set the leader to EXIT_DEAD before iterating. This maintains the invariant that release_task() only ever handles a task in EXIT_DEAD. This is a common-sense invariant that is already always true except in this one arcane case of zombie leader whose parent ignores SIGCHLD. This change is harmless and only costs one store in this one rare case. It keeps the expected state more consisently sane, which is nicer when debugging weirdness in release_task(). It also lets some future code in the tracehook entry points rely on this invariant for bookkeeping. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 13 +++++++++++++ include/linux/tracehook.h | 28 ++++++++++++++++++++++++++++ kernel/exit.c | 21 +++++++++------------ 3 files changed, 50 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index dae6d85520fb..ed69c03692d9 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -176,6 +176,19 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) } } +/** + * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped + * @task: task in %EXIT_DEAD state + * + * Called with write_lock(&tasklist_lock) held. + */ +static inline void ptrace_release_task(struct task_struct *task) +{ + BUG_ON(!list_empty(&task->ptraced)); + ptrace_unlink(task); + BUG_ON(!list_empty(&task->ptrace_entry)); +} + #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 830e6e16097d..9a5b3be2503a 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -228,4 +228,32 @@ static inline void tracehook_report_vfork_done(struct task_struct *child, ptrace_event(PT_TRACE_VFORK_DONE, PTRACE_EVENT_VFORK_DONE, pid); } +/** + * tracehook_prepare_release_task - task is being reaped, clean up tracing + * @task: task in %EXIT_DEAD state + * + * This is called in release_task() just before @task gets finally reaped + * and freed. This would be the ideal place to remove and clean up any + * tracing-related state for @task. + * + * Called with no locks held. + */ +static inline void tracehook_prepare_release_task(struct task_struct *task) +{ +} + +/** + * tracehook_finish_release_task - task is being reaped, clean up tracing + * @task: task in %EXIT_DEAD state + * + * This is called in release_task() when @task is being in the middle of + * being reaped. After this, there must be no tracing entanglements. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static inline void tracehook_finish_release_task(struct task_struct *task) +{ + ptrace_release_task(task); +} + #endif /* */ diff --git a/kernel/exit.c b/kernel/exit.c index c3691cbc220a..da28745f7c38 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -163,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } -/* - * Do final ptrace-related cleanup of a zombie being reaped. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_release_task(struct task_struct *p) -{ - BUG_ON(!list_empty(&p->ptraced)); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_entry)); -} void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: + tracehook_prepare_release_task(p); atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_release_task(p); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -205,6 +195,13 @@ repeat: * that case. */ zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); -- cgit From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:54 -0700 Subject: tracehook: death This moves the ptrace logic in task death (exit_notify) into tracehook.h inlines. Some code is rearranged slightly to make things nicer. There is no change, only cleanup. There is one hook called with the tasklist_lock write-locked, as ptrace needs. There is also a new hook called after exit_state changes and without locks. This is a better place for tracing work to be in the future, since it doesn't delay the whole system with locking. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/tracehook.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/exit.c | 26 ++++++++---------------- kernel/signal.c | 10 ++++++--- 4 files changed, 69 insertions(+), 21 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index adb8077dc463..a95d84d0da95 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1796,7 +1796,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void do_notify_parent(struct task_struct *, int); +extern int do_notify_parent(struct task_struct *, int); extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6dc428dd2f38..4c50e1b57349 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -471,4 +471,56 @@ static inline int tracehook_notify_jctl(int notify, int why) return notify || (current->ptrace & PT_PTRACED); } +/** + * tracehook_notify_death - task is dead, ready to notify parent + * @task: @current task now exiting + * @death_cookie: value to pass to tracehook_report_death() + * @group_dead: nonzero if this was the last thread in the group to die + * + * Return the signal number to send our parent with do_notify_parent(), or + * zero to send no signal and leave a zombie, or -1 to self-reap right now. + * + * Called with write_lock_irq(&tasklist_lock) held. + */ +static inline int tracehook_notify_death(struct task_struct *task, + void **death_cookie, int group_dead) +{ + if (task->exit_signal == -1) + return task->ptrace ? SIGCHLD : -1; + + /* + * If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. exit_signal + * only has special meaning to our real parent. + */ + if (thread_group_empty(task) && !ptrace_reparented(task)) + return task->exit_signal; + + return task->ptrace ? SIGCHLD : 0; +} + +/** + * tracehook_report_death - task is dead and ready to be reaped + * @task: @current task now exiting + * @signal: signal number sent to parent, or 0 or -1 + * @death_cookie: value passed back from tracehook_notify_death() + * @group_dead: nonzero if this was the last thread in the group to die + * + * Thread has just become a zombie or is about to self-reap. If positive, + * @signal is the signal number just sent to the parent (usually %SIGCHLD). + * If @signal is -1, this thread will self-reap. If @signal is 0, this is + * a delayed_group_leader() zombie. The @death_cookie was passed back by + * tracehook_notify_death(). + * + * If normal reaping is not inhibited, @task->exit_state might be changing + * in parallel. + * + * Called without locks. + */ +static inline void tracehook_report_death(struct task_struct *task, + int signal, void *death_cookie, + int group_dead) +{ +} + #endif /* */ diff --git a/kernel/exit.c b/kernel/exit.c index da28745f7c38..6cdf60712bd2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal > 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal < 0) release_task(tsk); } diff --git a/kernel/signal.c b/kernel/signal.c index e9e699f4b1bd..0e862d3130ff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p, /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. + * + * Returns -1 if our parent ignored us and so we've switched to + * self-reaping, or else @sig. */ - -void do_notify_parent(struct task_struct *tsk, int sig) +int do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; @@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) */ tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; + sig = -1; } if (valid_signal(sig) && sig > 0) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); + + return sig; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) -- cgit From 7f2da1e7d0330395e5e9e350b879b98a1ea495df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 20:44:54 -0400 Subject: [PATCH] kill altroot long overdue... Signed-off-by: Al Viro --- fs/namei.c | 89 +------------------------------------------ fs/namespace.c | 8 +--- fs/open.c | 3 +- include/asm-alpha/namei.h | 17 --------- include/asm-arm/namei.h | 25 ------------ include/asm-avr32/namei.h | 7 ---- include/asm-blackfin/namei.h | 19 --------- include/asm-cris/namei.h | 17 --------- include/asm-frv/namei.h | 18 --------- include/asm-h8300/namei.h | 17 --------- include/asm-ia64/namei.h | 25 ------------ include/asm-m32r/namei.h | 17 --------- include/asm-m68k/namei.h | 17 --------- include/asm-m68knommu/namei.h | 1 - include/asm-mips/namei.h | 11 ------ include/asm-mn10300/namei.h | 22 ----------- include/asm-parisc/namei.h | 17 --------- include/asm-powerpc/namei.h | 20 ---------- include/asm-s390/namei.h | 21 ---------- include/asm-sh/namei.h | 17 --------- include/asm-sparc/namei.h | 8 ---- include/asm-sparc64/namei.h | 1 - include/asm-um/namei.h | 6 --- include/asm-v850/namei.h | 17 --------- include/asm-x86/namei.h | 11 ------ include/asm-xtensa/namei.h | 26 ------------- include/linux/fs_struct.h | 3 +- include/linux/namei.h | 1 - kernel/exec_domain.c | 1 - kernel/exit.c | 2 - kernel/fork.c | 7 ---- 31 files changed, 5 insertions(+), 466 deletions(-) delete mode 100644 include/asm-alpha/namei.h delete mode 100644 include/asm-arm/namei.h delete mode 100644 include/asm-avr32/namei.h delete mode 100644 include/asm-blackfin/namei.h delete mode 100644 include/asm-cris/namei.h delete mode 100644 include/asm-frv/namei.h delete mode 100644 include/asm-h8300/namei.h delete mode 100644 include/asm-ia64/namei.h delete mode 100644 include/asm-m32r/namei.h delete mode 100644 include/asm-m68k/namei.h delete mode 100644 include/asm-m68knommu/namei.h delete mode 100644 include/asm-mips/namei.h delete mode 100644 include/asm-mn10300/namei.h delete mode 100644 include/asm-parisc/namei.h delete mode 100644 include/asm-powerpc/namei.h delete mode 100644 include/asm-s390/namei.h delete mode 100644 include/asm-sh/namei.h delete mode 100644 include/asm-sparc/namei.h delete mode 100644 include/asm-sparc64/namei.h delete mode 100644 include/asm-um/namei.h delete mode 100644 include/asm-v850/namei.h delete mode 100644 include/asm-x86/namei.h delete mode 100644 include/asm-xtensa/namei.h (limited to 'kernel/exit.c') diff --git a/fs/namei.c b/fs/namei.c index 6c76e1ee9c45..095818089ac1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) @@ -562,27 +561,16 @@ out_unlock: return result; } -static int __emul_lookup_dentry(const char *, struct nameidata *); - /* SMP-safe */ -static __always_inline int +static __always_inline void walk_init_root(const char *name, struct nameidata *nd) { struct fs_struct *fs = current->fs; read_lock(&fs->lock); - if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { - nd->path = fs->altroot; - path_get(&fs->altroot); - read_unlock(&fs->lock); - if (__emul_lookup_dentry(name,nd)) - return 0; - read_lock(&fs->lock); - } nd->path = fs->root; path_get(&fs->root); read_unlock(&fs->lock); - return 1; } /* @@ -623,12 +611,9 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l if (*link == '/') { path_put(&nd->path); - if (!walk_init_root(link, nd)) - /* weird __emul_prefix() stuff did it */ - goto out; + walk_init_root(link, nd); } res = link_path_walk(link, nd); -out: if (nd->depth || res || nd->last_type!=LAST_NORM) return res; /* @@ -1077,67 +1062,6 @@ static int path_walk(const char *name, struct nameidata *nd) return link_path_walk(name, nd); } -/* - * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if - * everything is done. Returns 0 and drops input nd, if lookup failed; - */ -static int __emul_lookup_dentry(const char *name, struct nameidata *nd) -{ - if (path_walk(name, nd)) - return 0; /* something went wrong... */ - - if (!nd->path.dentry->d_inode || - S_ISDIR(nd->path.dentry->d_inode->i_mode)) { - struct path old_path = nd->path; - struct qstr last = nd->last; - int last_type = nd->last_type; - struct fs_struct *fs = current->fs; - - /* - * NAME was not found in alternate root or it's a directory. - * Try to find it in the normal root: - */ - nd->last_type = LAST_ROOT; - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); - if (path_walk(name, nd) == 0) { - if (nd->path.dentry->d_inode) { - path_put(&old_path); - return 1; - } - path_put(&nd->path); - } - nd->path = old_path; - nd->last = last; - nd->last_type = last_type; - } - return 1; -} - -void set_fs_altroot(void) -{ - char *emul = __emul_prefix(); - struct nameidata nd; - struct path path = {}, old_path; - int err; - struct fs_struct *fs = current->fs; - - if (!emul) - goto set_it; - err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd); - if (!err) - path = nd.path; -set_it: - write_lock(&fs->lock); - old_path = fs->altroot; - fs->altroot = path; - write_unlock(&fs->lock); - if (old_path.dentry) - path_put(&old_path); -} - /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) @@ -1153,14 +1077,6 @@ static int do_path_lookup(int dfd, const char *name, if (*name=='/') { read_lock(&fs->lock); - if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { - nd->path = fs->altroot; - path_get(&fs->altroot); - read_unlock(&fs->lock); - if (__emul_lookup_dentry(name,nd)) - goto out; /* found in altroot */ - read_lock(&fs->lock); - } nd->path = fs->root; path_get(&fs->root); read_unlock(&fs->lock); @@ -1194,7 +1110,6 @@ static int do_path_lookup(int dfd, const char *name, } retval = path_walk(name, nd); -out: if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); diff --git a/fs/namespace.c b/fs/namespace.c index f30b11e2240e..c4fcf48acef8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1972,7 +1972,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, struct fs_struct *fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct vfsmount *p, *q; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); @@ -2015,10 +2015,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, pwdmnt = p; fs->pwd.mnt = mntget(q); } - if (p == fs->altroot.mnt) { - altrootmnt = p; - fs->altroot.mnt = mntget(q); - } } p = next_mnt(p, mnt_ns->root); q = next_mnt(q, new_ns->root); @@ -2029,8 +2025,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); - if (altrootmnt) - mntput(altrootmnt); return new_ns; } diff --git a/fs/open.c b/fs/open.c index 8e02d42bfe44..d3a2a00f52dc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -548,7 +548,7 @@ asmlinkage long sys_chroot(const char __user * filename) struct nameidata nd; int error; - error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd); if (error) goto out; @@ -561,7 +561,6 @@ asmlinkage long sys_chroot(const char __user * filename) goto dput_and_out; set_fs_root(current->fs, &nd.path); - set_fs_altroot(); error = 0; dput_and_out: path_put(&nd.path); diff --git a/include/asm-alpha/namei.h b/include/asm-alpha/namei.h deleted file mode 100644 index 5cc9bb39499d..000000000000 --- a/include/asm-alpha/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.1 1996/12/13 14:48:21 jj Exp $ - * linux/include/asm-alpha/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __ALPHA_NAMEI_H -#define __ALPHA_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __ALPHA_NAMEI_H */ diff --git a/include/asm-arm/namei.h b/include/asm-arm/namei.h deleted file mode 100644 index a402d3b9d0f7..000000000000 --- a/include/asm-arm/namei.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * linux/include/asm-arm/namei.h - * - * Routines to handle famous /usr/gnemul - * Derived from the Sparc version of this file - * - * Included from linux/fs/namei.c - */ - -#ifndef __ASMARM_NAMEI_H -#define __ASMARM_NAMEI_H - -#define ARM_BSD_EMUL "usr/gnemul/bsd/" - -static inline char *__emul_prefix(void) -{ - switch (current->personality) { - case PER_BSD: - return ARM_BSD_EMUL; - default: - return NULL; - } -} - -#endif /* __ASMARM_NAMEI_H */ diff --git a/include/asm-avr32/namei.h b/include/asm-avr32/namei.h deleted file mode 100644 index f0a26de06cab..000000000000 --- a/include/asm-avr32/namei.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_AVR32_NAMEI_H -#define __ASM_AVR32_NAMEI_H - -/* This dummy routine may be changed to something useful */ -#define __emul_prefix() NULL - -#endif /* __ASM_AVR32_NAMEI_H */ diff --git a/include/asm-blackfin/namei.h b/include/asm-blackfin/namei.h deleted file mode 100644 index 8b89a2d65cb4..000000000000 --- a/include/asm-blackfin/namei.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * linux/include/asm/namei.h - * - * Included from linux/fs/namei.c - * - * Changes made by Lineo Inc. May 2001 - */ - -#ifndef __BFIN_NAMEI_H -#define __BFIN_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif diff --git a/include/asm-cris/namei.h b/include/asm-cris/namei.h deleted file mode 100644 index 8a3be7a6d9f6..000000000000 --- a/include/asm-cris/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.1 2000/07/10 16:32:31 bjornw Exp $ - * linux/include/asm-cris/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __CRIS_NAMEI_H -#define __CRIS_NAMEI_H - -/* used to find file-system prefixes for doing emulations - * see for example asm-sparc/namei.h - * we don't use it... - */ - -#define __emul_prefix() NULL - -#endif /* __CRIS_NAMEI_H */ diff --git a/include/asm-frv/namei.h b/include/asm-frv/namei.h deleted file mode 100644 index 4ea57171d951..000000000000 --- a/include/asm-frv/namei.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * include/asm-frv/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __ASM_NAMEI_H -#define __ASM_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif - diff --git a/include/asm-h8300/namei.h b/include/asm-h8300/namei.h deleted file mode 100644 index ab6f196db6e0..000000000000 --- a/include/asm-h8300/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * linux/include/asm-h8300/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __H8300_NAMEI_H -#define __H8300_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif diff --git a/include/asm-ia64/namei.h b/include/asm-ia64/namei.h deleted file mode 100644 index 78e768079083..000000000000 --- a/include/asm-ia64/namei.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _ASM_IA64_NAMEI_H -#define _ASM_IA64_NAMEI_H - -/* - * Modified 1998, 1999, 2001 - * David Mosberger-Tang , Hewlett-Packard Co - */ - -#include -#include - -#define EMUL_PREFIX_LINUX_IA32 "/emul/ia32-linux/" - -static inline char * -__emul_prefix (void) -{ - switch (current->personality) { - case PER_LINUX32: - return EMUL_PREFIX_LINUX_IA32; - default: - return NULL; - } -} - -#endif /* _ASM_IA64_NAMEI_H */ diff --git a/include/asm-m32r/namei.h b/include/asm-m32r/namei.h deleted file mode 100644 index 210f8056b805..000000000000 --- a/include/asm-m32r/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _ASM_M32R_NAMEI_H -#define _ASM_M32R_NAMEI_H - -/* - * linux/include/asm-m32r/namei.h - * - * Included from linux/fs/namei.c - */ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_M32R_NAMEI_H */ diff --git a/include/asm-m68k/namei.h b/include/asm-m68k/namei.h deleted file mode 100644 index f33f243b644a..000000000000 --- a/include/asm-m68k/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * linux/include/asm-m68k/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __M68K_NAMEI_H -#define __M68K_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif diff --git a/include/asm-m68knommu/namei.h b/include/asm-m68knommu/namei.h deleted file mode 100644 index 31a85d27b931..000000000000 --- a/include/asm-m68knommu/namei.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-mips/namei.h b/include/asm-mips/namei.h deleted file mode 100644 index a6605a752469..000000000000 --- a/include/asm-mips/namei.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_NAMEI_H -#define _ASM_NAMEI_H - -/* - * This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_NAMEI_H */ diff --git a/include/asm-mn10300/namei.h b/include/asm-mn10300/namei.h deleted file mode 100644 index bd9ce94aeb65..000000000000 --- a/include/asm-mn10300/namei.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Emulation stuff - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _ASM_NAMEI_H -#define _ASM_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_NAMEI_H */ diff --git a/include/asm-parisc/namei.h b/include/asm-parisc/namei.h deleted file mode 100644 index 8d29b3d9fb33..000000000000 --- a/include/asm-parisc/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.1 1996/12/13 14:48:21 jj Exp $ - * linux/include/asm-parisc/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __PARISC_NAMEI_H -#define __PARISC_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __PARISC_NAMEI_H */ diff --git a/include/asm-powerpc/namei.h b/include/asm-powerpc/namei.h deleted file mode 100644 index 657443474a6a..000000000000 --- a/include/asm-powerpc/namei.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _ASM_POWERPC_NAMEI_H -#define _ASM_POWERPC_NAMEI_H - -#ifdef __KERNEL__ - -/* - * Adapted from include/asm-alpha/namei.h - * - * Included from fs/namei.c - */ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_NAMEI_H */ diff --git a/include/asm-s390/namei.h b/include/asm-s390/namei.h deleted file mode 100644 index 3e286bdde4b0..000000000000 --- a/include/asm-s390/namei.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * include/asm-s390/namei.h - * - * S390 version - * - * Derived from "include/asm-i386/namei.h" - * - * Included from linux/fs/namei.c - */ - -#ifndef __S390_NAMEI_H -#define __S390_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __S390_NAMEI_H */ diff --git a/include/asm-sh/namei.h b/include/asm-sh/namei.h deleted file mode 100644 index 338a5d947143..000000000000 --- a/include/asm-sh/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* $Id: namei.h,v 1.3 2000/07/04 06:24:49 gniibe Exp $ - * linux/include/asm-sh/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __ASM_SH_NAMEI_H -#define __ASM_SH_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __ASM_SH_NAMEI_H */ diff --git a/include/asm-sparc/namei.h b/include/asm-sparc/namei.h deleted file mode 100644 index eff944b8e321..000000000000 --- a/include/asm-sparc/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef ___ASM_SPARC_NAMEI_H -#define ___ASM_SPARC_NAMEI_H -#if defined(__sparc__) && defined(__arch64__) -#include -#else -#include -#endif -#endif diff --git a/include/asm-sparc64/namei.h b/include/asm-sparc64/namei.h deleted file mode 100644 index 1344a910ba2f..000000000000 --- a/include/asm-sparc64/namei.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/include/asm-um/namei.h b/include/asm-um/namei.h deleted file mode 100644 index 002984d5bc85..000000000000 --- a/include/asm-um/namei.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __UM_NAMEI_H -#define __UM_NAMEI_H - -#include "asm/arch/namei.h" - -#endif diff --git a/include/asm-v850/namei.h b/include/asm-v850/namei.h deleted file mode 100644 index ee8339b23843..000000000000 --- a/include/asm-v850/namei.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * linux/include/asm-v850/namei.h - * - * Included from linux/fs/namei.c - */ - -#ifndef __V850_NAMEI_H__ -#define __V850_NAMEI_H__ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __V850_NAMEI_H__ */ diff --git a/include/asm-x86/namei.h b/include/asm-x86/namei.h deleted file mode 100644 index 415ef5d9550e..000000000000 --- a/include/asm-x86/namei.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_X86_NAMEI_H -#define _ASM_X86_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* _ASM_X86_NAMEI_H */ diff --git a/include/asm-xtensa/namei.h b/include/asm-xtensa/namei.h deleted file mode 100644 index 3fdff039d27d..000000000000 --- a/include/asm-xtensa/namei.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * include/asm-xtensa/namei.h - * - * Included from linux/fs/namei.c - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_NAMEI_H -#define _XTENSA_NAMEI_H - -#ifdef __KERNEL__ - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __KERNEL__ */ -#endif /* _XTENSA_NAMEI_H */ diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 282f54219129..9e5a06e78d02 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -7,7 +7,7 @@ struct fs_struct { atomic_t count; rwlock_t lock; int umask; - struct path root, pwd, altroot; + struct path root, pwd; }; #define INIT_FS { \ @@ -19,7 +19,6 @@ struct fs_struct { extern struct kmem_cache *fs_cachep; extern void exit_fs(struct task_struct *); -extern void set_fs_altroot(void); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 3cf62d26d493..768773d57857 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -47,7 +47,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_DIRECTORY 2 #define LOOKUP_CONTINUE 4 #define LOOKUP_PARENT 16 -#define LOOKUP_NOALT 32 #define LOOKUP_REVAL 64 /* * Intent data diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c1ef192aa655..0d407e886735 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -168,7 +168,6 @@ __set_personality(u_long personality) current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; - set_fs_altroot(); module_put(oep->module); return 0; diff --git a/kernel/exit.c b/kernel/exit.c index 6cdf60712bd2..0caf590548a0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -565,8 +565,6 @@ void put_fs_struct(struct fs_struct *fs) if (atomic_dec_and_test(&fs->count)) { path_put(&fs->root); path_put(&fs->pwd); - if (fs->altroot.dentry) - path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } diff --git a/kernel/fork.c b/kernel/fork.c index abb3ed6298f6..5e050c1317c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) path_get(&old->root); fs->pwd = old->pwd; path_get(&old->pwd); - if (old->altroot.dentry) { - fs->altroot = old->altroot; - path_get(&old->altroot); - } else { - fs->altroot.mnt = NULL; - fs->altroot.dentry = NULL; - } read_unlock(&old->lock); } return fs; -- cgit From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 27 Jul 2008 17:29:15 +0200 Subject: task IO accounting: improve code readability Put all i/o statistics in struct proc_io_accounting and use inline functions to initialize and increment statistics, removing a lot of single variable assignments. This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and CONFIG_TASK_IO_ACCOUNTING=y). text data bss dec hex filename 11651 0 0 11651 2d83 kernel/exit.o.before 11619 0 0 11619 2d63 kernel/exit.o.after 10886 132 136 11154 2b92 kernel/fork.o.before 10758 132 136 11026 2b12 kernel/fork.o.after 3082029 807968 4818600 8708597 84e1f5 vmlinux.o.before 3081869 807968 4818600 8708437 84e155 vmlinux.o.after Signed-off-by: Andrea Righi Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 57 ++++++++++------------------------ include/linux/sched.h | 19 ++++-------- include/linux/task_io_accounting.h | 27 ++++++++++++++-- include/linux/task_io_accounting_ops.h | 56 +++++++++++++++++++++++++++------ kernel/exit.c | 30 ++---------------- kernel/fork.c | 15 ++------- kernel/tsacct.c | 14 ++++----- 7 files changed, 104 insertions(+), 114 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index e74308bdabd3..3d94906c7aa8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -2402,44 +2403,17 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { - u64 rchar, wchar, syscr, syscw; - struct task_io_accounting ioac; - - rchar = task->rchar; - wchar = task->wchar; - syscr = task->syscr; - syscw = task->syscw; - memcpy(&ioac, &task->ioac, sizeof(ioac)); - - if (whole) { - unsigned long flags; - - if (lock_task_sighand(task, &flags)) { - struct signal_struct *sig = task->signal; - struct task_struct *t = task; - - rchar += sig->rchar; - wchar += sig->wchar; - syscr += sig->syscr; - syscw += sig->syscw; - - ioac.read_bytes += sig->ioac.read_bytes; - ioac.write_bytes += sig->ioac.write_bytes; - ioac.cancelled_write_bytes += - sig->ioac.cancelled_write_bytes; - while_each_thread(task, t) { - rchar += t->rchar; - wchar += t->wchar; - syscr += t->syscr; - syscw += t->syscw; - - ioac.read_bytes += t->ioac.read_bytes; - ioac.write_bytes += t->ioac.write_bytes; - ioac.cancelled_write_bytes += - t->ioac.cancelled_write_bytes; - } - unlock_task_sighand(task, &flags); - } + struct proc_io_accounting acct = task->ioac; + unsigned long flags; + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); } return sprintf(buffer, "rchar: %llu\n" @@ -2449,9 +2423,10 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", - rchar, wchar, syscr, syscw, - ioac.read_bytes, ioac.write_bytes, - ioac.cancelled_write_bytes); + acct.chr.rchar, acct.chr.wchar, + acct.chr.syscr, acct.chr.syscw, + acct.blk.read_bytes, acct.blk.write_bytes, + acct.blk.cancelled_write_bytes); } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/include/linux/sched.h b/include/linux/sched.h index f59318a0099b..034c1ca6b332 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -505,10 +505,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; -#ifdef CONFIG_TASK_XACCT - u64 rchar, wchar, syscr, syscw; -#endif - struct task_io_accounting ioac; + struct proc_io_accounting ioac; /* * Cumulative ns of scheduled CPU time for dead threads in the @@ -1256,11 +1253,7 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ -#ifdef CONFIG_TASK_XACCT -/* i/o counters(bytes read/written, #syscalls */ - u64 rchar, wchar, syscr, syscw; -#endif - struct task_io_accounting ioac; + struct proc_io_accounting ioac; #if defined(CONFIG_TASK_XACCT) u64 acct_rss_mem1; /* accumulated rss usage */ u64 acct_vm_mem1; /* accumulated virtual memory usage */ @@ -2190,22 +2183,22 @@ extern long sched_group_rt_period(struct task_group *tg); #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { - tsk->rchar += amt; + tsk->ioac.chr.rchar += amt; } static inline void add_wchar(struct task_struct *tsk, ssize_t amt) { - tsk->wchar += amt; + tsk->ioac.chr.wchar += amt; } static inline void inc_syscr(struct task_struct *tsk) { - tsk->syscr++; + tsk->ioac.chr.syscr++; } static inline void inc_syscw(struct task_struct *tsk) { - tsk->syscw++; + tsk->ioac.chr.syscw++; } #else static inline void add_rchar(struct task_struct *tsk, ssize_t amt) diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h index 44d00e9cceea..165390f8b936 100644 --- a/include/linux/task_io_accounting.h +++ b/include/linux/task_io_accounting.h @@ -1,5 +1,5 @@ /* - * task_io_accounting: a structure which is used for recording a single task's + * proc_io_accounting: a structure which is used for recording a single task's * IO statistics. * * Don't include this header file directly - it is designed to be dragged in via @@ -8,6 +8,22 @@ * Blame akpm@osdl.org for all this. */ +#ifdef CONFIG_TASK_XACCT +struct task_chr_io_accounting { + /* bytes read */ + u64 rchar; + /* bytes written */ + u64 wchar; + /* # of read syscalls */ + u64 syscr; + /* # of write syscalls */ + u64 syscw; +}; +#else /* CONFIG_TASK_XACCT */ +struct task_chr_io_accounting { +}; +#endif /* CONFIG_TASK_XACCT */ + #ifdef CONFIG_TASK_IO_ACCOUNTING struct task_io_accounting { /* @@ -31,7 +47,12 @@ struct task_io_accounting { */ u64 cancelled_write_bytes; }; -#else +#else /* CONFIG_TASK_IO_ACCOUNTING */ struct task_io_accounting { }; -#endif +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +struct proc_io_accounting { + struct task_chr_io_accounting chr; + struct task_io_accounting blk; +}; diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h index ff46c6fad79d..e6f958ebe97f 100644 --- a/include/linux/task_io_accounting_ops.h +++ b/include/linux/task_io_accounting_ops.h @@ -9,7 +9,7 @@ #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { - current->ioac.read_bytes += bytes; + current->ioac.blk.read_bytes += bytes; } /* @@ -18,12 +18,12 @@ static inline void task_io_account_read(size_t bytes) */ static inline unsigned long task_io_get_inblock(const struct task_struct *p) { - return p->ioac.read_bytes >> 9; + return p->ioac.blk.read_bytes >> 9; } static inline void task_io_account_write(size_t bytes) { - current->ioac.write_bytes += bytes; + current->ioac.blk.write_bytes += bytes; } /* @@ -32,17 +32,25 @@ static inline void task_io_account_write(size_t bytes) */ static inline unsigned long task_io_get_oublock(const struct task_struct *p) { - return p->ioac.write_bytes >> 9; + return p->ioac.blk.write_bytes >> 9; } static inline void task_io_account_cancelled_write(size_t bytes) { - current->ioac.cancelled_write_bytes += bytes; + current->ioac.blk.cancelled_write_bytes += bytes; } -static inline void task_io_accounting_init(struct task_struct *tsk) +static inline void task_io_accounting_init(struct proc_io_accounting *ioac) { - memset(&tsk->ioac, 0, sizeof(tsk->ioac)); + memset(ioac, 0, sizeof(*ioac)); +} + +static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + dst->blk.read_bytes += src->blk.read_bytes; + dst->blk.write_bytes += src->blk.write_bytes; + dst->blk.cancelled_write_bytes += src->blk.cancelled_write_bytes; } #else @@ -69,9 +77,37 @@ static inline void task_io_account_cancelled_write(size_t bytes) { } -static inline void task_io_accounting_init(struct task_struct *tsk) +static inline void task_io_accounting_init(struct proc_io_accounting *ioac) +{ +} + +static inline void task_blk_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) { } -#endif /* CONFIG_TASK_IO_ACCOUNTING */ -#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +#ifdef CONFIG_TASK_XACCT +static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + dst->chr.rchar += src->chr.rchar; + dst->chr.wchar += src->chr.wchar; + dst->chr.syscr += src->chr.syscr; + dst->chr.syscw += src->chr.syscw; +} +#else +static inline void task_chr_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ +} +#endif /* CONFIG_TASK_XACCT */ + +static inline void task_io_accounting_add(struct proc_io_accounting *dst, + struct proc_io_accounting *src) +{ + task_chr_io_accounting_add(dst, src); + task_blk_io_accounting_add(dst, src); +} +#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */ diff --git a/kernel/exit.c b/kernel/exit.c index 0caf590548a0..eb4d6470d1d0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); -#ifdef CONFIG_TASK_XACCT - sig->rchar += tsk->rchar; - sig->wchar += tsk->wchar; - sig->syscr += tsk->syscr; - sig->syscw += tsk->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - sig->ioac.read_bytes += tsk->ioac.read_bytes; - sig->ioac.write_bytes += tsk->ioac.write_bytes; - sig->ioac.cancelled_write_bytes += - tsk->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; -#ifdef CONFIG_TASK_XACCT - psig->rchar += p->rchar + sig->rchar; - psig->wchar += p->wchar + sig->wchar; - psig->syscr += p->syscr + sig->syscr; - psig->syscw += p->syscw + sig->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - psig->ioac.read_bytes += - p->ioac.read_bytes + sig->ioac.read_bytes; - psig->ioac.write_bytes += - p->ioac.write_bytes + sig->ioac.write_bytes; - psig->ioac.cancelled_write_bytes += - p->ioac.cancelled_write_bytes + - sig->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 5e050c1317c4..8214ba7c8bb1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; -#ifdef CONFIG_TASK_XACCT - sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; -#endif -#ifdef CONFIG_TASK_IO_ACCOUNTING - memset(&sig->ioac, 0, sizeof(sig->ioac)); -#endif + task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 3da47ccdc5e5..f9cd2561689c 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->rchar; - stats->write_char = p->wchar; - stats->read_syscalls = p->syscr; - stats->write_syscalls = p->syscw; + stats->read_char = p->ioac.chr.rchar; + stats->write_char = p->ioac.chr.wchar; + stats->read_syscalls = p->ioac.chr.syscr; + stats->write_syscalls = p->ioac.chr.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.blk.read_bytes; + stats->write_bytes = p->ioac.blk.write_bytes; + stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes; #else stats->read_bytes = 0; stats->write_bytes = 0; -- cgit From 5c7edcd7ee6b77b88252fe4096dce1a46a60c829 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 31 Jul 2008 02:04:09 -0700 Subject: tracehook: fix exit_signal=0 case My commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664 introduced a regression (sorry about that) for the odd case of exit_signal=0 (e.g. clone_flags=0). This is not a normal use, but it's used by a case in the glibc test suite. Dying with exit_signal=0 sends no signal, but it's supposed to wake up a parent's blocked wait*() calls (unlike the delayed_group_leader case). This fixes tracehook_notify_death() and its caller to distinguish a "signal 0" wakeup from the delayed_group_leader case (with no wakeup). Signed-off-by: Roland McGrath Tested-by: Serge Hallyn Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 21 +++++++++++++-------- kernel/exit.c | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index b1875582c1a1..12532839f508 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -493,16 +493,21 @@ static inline int tracehook_notify_jctl(int notify, int why) * @death_cookie: value to pass to tracehook_report_death() * @group_dead: nonzero if this was the last thread in the group to die * - * Return the signal number to send our parent with do_notify_parent(), or - * zero to send no signal and leave a zombie, or -1 to self-reap right now. + * A return value >= 0 means call do_notify_parent() with that signal + * number. Negative return value can be %DEATH_REAP to self-reap right + * now, or %DEATH_DELAYED_GROUP_LEADER to a zombie without notifying our + * parent. Note that a return value of 0 means a do_notify_parent() call + * that sends no signal, but still wakes up a parent blocked in wait*(). * * Called with write_lock_irq(&tasklist_lock) held. */ +#define DEATH_REAP -1 +#define DEATH_DELAYED_GROUP_LEADER -2 static inline int tracehook_notify_death(struct task_struct *task, void **death_cookie, int group_dead) { if (task->exit_signal == -1) - return task->ptrace ? SIGCHLD : -1; + return task->ptrace ? SIGCHLD : DEATH_REAP; /* * If something other than our normal parent is ptracing us, then @@ -512,21 +517,21 @@ static inline int tracehook_notify_death(struct task_struct *task, if (thread_group_empty(task) && !ptrace_reparented(task)) return task->exit_signal; - return task->ptrace ? SIGCHLD : 0; + return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER; } /** * tracehook_report_death - task is dead and ready to be reaped * @task: @current task now exiting - * @signal: signal number sent to parent, or 0 or -1 + * @signal: return value from tracheook_notify_death() * @death_cookie: value passed back from tracehook_notify_death() * @group_dead: nonzero if this was the last thread in the group to die * * Thread has just become a zombie or is about to self-reap. If positive, * @signal is the signal number just sent to the parent (usually %SIGCHLD). - * If @signal is -1, this thread will self-reap. If @signal is 0, this is - * a delayed_group_leader() zombie. The @death_cookie was passed back by - * tracehook_notify_death(). + * If @signal is %DEATH_REAP, this thread will self-reap. If @signal is + * %DEATH_DELAYED_GROUP_LEADER, this is a delayed_group_leader() zombie. + * The @death_cookie was passed back by tracehook_notify_death(). * * If normal reaping is not inhibited, @task->exit_state might be changing * in parallel. diff --git a/kernel/exit.c b/kernel/exit.c index eb4d6470d1d0..38ec40630149 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -911,10 +911,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal > 0) + if (signal >= 0) signal = do_notify_parent(tsk, signal); - tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -927,7 +927,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tracehook_report_death(tsk, signal, cookie, group_dead); /* If the process is dead, release it - nobody will wait for it */ - if (signal < 0) + if (signal == DEATH_REAP) release_task(tsk); } -- cgit