From a585042f7b933539a0b6bc63650c2d49ffb2e55d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:23 -0800 Subject: freezer: remove racy clear_freeze_flag() and set PF_NOFREEZE on dead tasks clear_freeze_flag() in exit_mm() is racy. Freezing can start afterwards. Remove it. Skipping freezer for exiting task will be properly implemented later. Also, freezable() was testing exit_state directly to make system freezer ignore dead tasks. Let the exiting task set PF_NOFREEZE after entering TASK_DEAD instead. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/exit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..95a4141d07e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -679,8 +679,6 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); - /* We don't want this task to be frozen prematurely */ - clear_freeze_flag(tsk); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -1040,6 +1038,7 @@ NORET_TYPE void do_exit(long code) exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; + tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ schedule(); BUG(); /* Avoid "noreturn function does return". */ -- cgit v1.2.3 From 648616343cdbe904c585a6c12e323d3b3c72e46f Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 15 Dec 2011 14:56:09 +0100 Subject: [S390] cputime: add sparse checking and cleanup Make cputime_t and cputime64_t nocast to enable sparse checking to detect incorrect use of cputime. Drop the cputime macros for simple scalar operations. The conversion macros are still needed. Signed-off-by: Martin Schwidefsky --- kernel/exit.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..5e0d1f4c696e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime += tsk->utime; + sig->stime += tsk->stime; + sig->gtime += tsk->gtime; sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; - psig->cutime = - cputime_add(psig->cutime, - cputime_add(tgutime, - sig->cutime)); - psig->cstime = - cputime_add(psig->cstime, - cputime_add(tgstime, - sig->cstime)); - psig->cgtime = - cputime_add(psig->cgtime, - cputime_add(p->gtime, - cputime_add(sig->gtime, - sig->cgtime))); + psig->cutime += tgutime + sig->cutime; + psig->cstime += tgstime + sig->cstime; + psig->cgtime += p->gtime + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += -- cgit v1.2.3 From 54848d73f9f254631303d6eab9b976855988b266 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 5 Apr 2011 13:21:19 -0600 Subject: writeback: charge leaked page dirties to active tasks It's a years long problem that a large number of short-lived dirtiers (eg. gcc instances in a fast kernel build) may starve long-run dirtiers (eg. dd) as well as pushing the dirty pages to the global hard limit. The solution is to charge the pages dirtied by the exited gcc to the other random dirtying tasks. It sounds not perfect, however should behave good enough in practice, seeing as that throttled tasks aren't actually running so those that are running are more likely to pick it up and get throttled, therefore promoting an equal spread. Randy: fix compile error: 'dirty_throttle_leaks' undeclared in exit.c Acked-by: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Randy Dunlap Signed-off-by: Wu Fengguang --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..d4aac24cc469 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1037,6 +1038,8 @@ NORET_TYPE void do_exit(long code) validate_creds_for_do_exit(tsk); preempt_disable(); + if (tsk->nr_dirtied) + __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; -- cgit v1.2.3 From 50b8d257486a45cba7b65ca978986ed216bbcc10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jan 2012 17:29:02 +0100 Subject: ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race Test-case: int main(void) { int pid, status; pid = fork(); if (!pid) { for (;;) { if (!fork()) return 0; if (waitpid(-1, &status, 0) < 0) { printf("ERR!! wait: %m\n"); return 0; } } } assert(ptrace(PTRACE_ATTACH, pid, 0,0) == 0); assert(waitpid(-1, NULL, 0) == pid); assert(ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACEFORK) == 0); do { ptrace(PTRACE_CONT, pid, 0, 0); pid = waitpid(-1, NULL, 0); } while (pid > 0); return 1; } It fails because ->real_parent sees its child in EXIT_DEAD state while the tracer is going to change the state back to EXIT_ZOMBIE in wait_task_zombie(). The offending commit is 823b018e which moved the EXIT_DEAD check, but in fact we should not blame it. The original code was not correct as well because it didn't take ptrace_reparented() into account and because we can't really trust ->ptrace. This patch adds the additional check to close this particular race but it doesn't solve the whole problem. We simply can't rely on ->ptrace in this case, it can be cleared if the tracer is multithreaded by the exiting ->parent. I think we should kill EXIT_DEAD altogether, we should always remove the soon-to-be-reaped child from ->children or at least we should never do the DEAD->ZOMBIE transition. But this is too complex for 3.2. Reported-and-tested-by: Denys Vlasenko Tested-by: Lukasz Michalik Acked-by: Tejun Heo Cc: [3.0+] Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..e6e01b959a0e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1540,8 +1540,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (p->exit_state == EXIT_DEAD) + if (unlikely(p->exit_state == EXIT_DEAD)) { + /* + * But do not ignore this task until the tracer does + * wait_task_zombie()->do_notify_parent(). + */ + if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + wo->notask_error = 0; return 0; + } /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { -- cgit v1.2.3 From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 12 Jan 2012 17:17:17 -0800 Subject: treewide: remove useless NORET_TYPE macro and uses It's a very old and now unused prototype marking so just delete it. Neaten panic pointer argument style to keep checkpatch quiet. Signed-off-by: Joe Perches Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Acked-by: Geert Uytterhoeven Acked-by: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -887,7 +887,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); @@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -NORET_TYPE void +void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; -- cgit v1.2.3 From a4ff8dba7d8ce5ceb43fb27df66292251cc73bdc Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:07 -0500 Subject: audit: inline audit_free to simplify the look of generic code make the conditional a static inline instead of doing it in generic code. Signed-off-by: Eric Paris --- kernel/exit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 94ed6e20bb53..88dcbbc446f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -964,8 +964,7 @@ NORET_TYPE void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); -- cgit v1.2.3 From b5740f4b2cb3503b436925eb2242bc3d75cd3dfe Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 17 Jan 2012 17:40:31 +0900 Subject: sched: Fix ancient race in do_exit() try_to_wake_up() has a problem which may change status from TASK_DEAD to TASK_RUNNING in race condition with SMI or guest environment of virtual machine. As a result, exited task is scheduled() again and panic occurs. Here is the sequence how it occurs: ----------------------------------+----------------------------- | CPU A | CPU B ----------------------------------+----------------------------- TASK A calls exit().... do_exit() exit_mm() down_read(mm->mmap_sem); rwsem_down_failed_common() set TASK_UNINTERRUPTIBLE set waiter.task <= task A list_add to sem->wait_list : raw_spin_unlock_irq() (I/O interruption occured) __rwsem_do_wake(mmap_sem) list_del(&waiter->list); waiter->task = NULL wake_up_process(task A) try_to_wake_up() (task is still TASK_UNINTERRUPTIBLE) p->on_rq is still 1.) ttwu_do_wakeup() (*A) : (I/O interruption handler finished) if (!waiter.task) schedule() is not called due to waiter.task is NULL. tsk->state = TASK_RUNNING : check_preempt_curr(); : task->state = TASK_DEAD (*B) <--- set TASK_RUNNING (*C) schedule() (exit task is running again) BUG_ON() is called! -------------------------------------------------------- The execution time between (*A) and (*B) is usually very short, because the interruption is disabled, and setting TASK_RUNNING at (*C) must be executed before setting TASK_DEAD. HOWEVER, if SMI is interrupted between (*A) and (*B), (*C) is able to execute AFTER setting TASK_DEAD! Then, exited task is scheduled again, and BUG_ON() is called.... If the system works on guest system of virtual machine, the time between (*A) and (*B) may be also long due to scheduling of hypervisor, and same phenomenon can occur. By this patch, do_exit() waits for releasing task->pi_lock which is used in try_to_wake_up(). It guarantees the task becomes TASK_DEAD after waking up. Signed-off-by: Yasunori Goto Acked-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20120117174031.3118.E1E9C6FF@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170d..4b4042f9bc6a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1038,6 +1038,22 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ -- cgit v1.2.3