From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 12 Dec 2008 13:49:45 +0100
Subject: perfcounters: implement "counter inheritance"

Impact: implement new performance feature

Counter inheritance can be used to run performance counters in a workload,
transparently - and pipe back the counter results to the parent counter.

Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).

The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.

The timec.c command utilizes counter inheritance:

  http://redhat.com/~mingo/perfcounters/timec.c

Sample output:

   $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

   Performance counter stats for 'ls':

           163516953 instructions
                2295 cache-misses
             2855182 branch-misses

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..d336c90a5f13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-#ifdef CONFIG_FUTEX
 	/*
-	 * This must happen late, after the PID is not
-	 * hashed anymore:
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
 	 */
+	perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
 	if (unlikely(current->pi_state_cache))
-- 
cgit v1.2.3


From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 14:10:57 +0100
Subject: perfcounters: fix task clock counter

Impact: fix per task clock counter precision

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index d336c90a5f13..244edfd96865 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	forget_original_parent(tsk);
 	exit_task_namespaces(tsk);
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-	/*
-	 * These must happen late, after the PID is not
-	 * hashed anymore, but still at a point that may sleep:
-	 */
-	perf_counter_exit_task(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
@@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
+	/*
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
+	 */
+	perf_counter_exit_task(tsk);
+
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-- 
cgit v1.2.3


From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 10:20:42 +0100
Subject: perfcounters: pull inherited counters

Change counter inheritance from a 'push' to a 'pull' model: instead of
child tasks pushing their final counts to the parent, reuse the wait4
infrastructure to pull counters as child tasks are exit-processed,
much like how cutime/cstime is collected.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 244edfd96865..101b7eeff44c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
+#ifdef CONFIG_PERF_COUNTERS
+	WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+#endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
@@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	forget_original_parent(tsk);
 	exit_task_namespaces(tsk);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/*
-	 * These must happen late, after the PID is not
-	 * hashed anymore, but still at a point that may sleep:
-	 */
-	perf_counter_exit_task(tsk);
-
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
@@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-- 
cgit v1.2.3


From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 09:36:00 -0400
Subject: tracing: create automated trace defines

This patch lowers the number of places a developer must modify to add
new tracepoints. The current method to add a new tracepoint
into an existing system is to write the trace point macro in the
trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or
DECLARE_TRACE, then they must add the same named item into the C file
with the macro DEFINE_TRACE(name) and then add the trace point.

This change cuts out the needing to add the DEFINE_TRACE(name).
Every file that uses the tracepoint must still include the trace/<type>.h
file, but the one C file must also add a define before the including
of that file.

 #define CREATE_TRACE_POINTS
 #include <trace/mytrace.h>

This will cause the trace/mytrace.h file to also produce the C code
necessary to implement the trace point.

Note, if more than one trace/<type>.h is used to create the C code
it is best to list them all together.

 #define CREATE_TRACE_POINTS
 #include <trace/foo.h>
 #include <trace/bar.h>
 #include <trace/fido.h>

Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with
the cleaner solution of the define above the includes over my first
design to have the C code include a "special" header.

This patch converts sched, irq and lockdep and skb to use this new
method.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/exit.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..2fe9d2c7eeee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,10 +56,6 @@
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
-- 
cgit v1.2.3


From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 19:39:12 -0400
Subject: tracing/events: move trace point headers into include/trace/events

Impact: clean up

Create a sub directory in include/trace called events to keep the
trace point headers in their own separate directory. Only headers that
declare trace points should be defined in this directory.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2fe9d2c7eeee..cab535c427b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
-- 
cgit v1.2.3


From 78a3d9d5654a7fd99cf8b2ab06b9497b9c7aad64 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 29 Apr 2009 18:01:23 +0200
Subject: do_wait: do take security_task_wait() into account

I was never able to understand what should we actually do when
security_task_wait() fails, but the current code doesn't look right.

If ->task_wait() returns the error, we update *notask_error correctly.
But then we either reap the child (despite the fact this was forbidden)
or clear *notask_error (and hide the securiy policy problems).

This patch assumes that "stolen by ptrace" doesn't matter. If selinux
denies the child we should ignore it but make sure we report -EACCESS
instead of -ECHLD if there are no other eligible children.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/exit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..d2e8239ea187 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1582,6 +1582,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		 */
 		if (*notask_error)
 			*notask_error = ret;
+		return 0;
 	}
 
 	if (likely(!ptrace) && unlikely(p->ptrace)) {
-- 
cgit v1.2.3


From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 20:45:59 +0200
Subject: perf_counter: Fix counter inheritance

Srivatsa Vaddagiri reported that a Java workload triggers this
warning in kernel/exit.c:

   WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));

Add the inherited counter propagation on self-detach, this could
cause counter leaks and incomplete stats in threaded code like
the below:

  #include <pthread.h>
  #include <unistd.h>

  void *thread(void *arg)
  {
          sleep(5);
          return NULL;
  }

  void main(void)
  {
          pthread_t thr;
          pthread_create(&thr, NULL, thread, NULL);
  }

Reported-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4741376c8dec..16d74f13a3e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	__unhash_process(tsk);
 
 	/*
-- 
cgit v1.2.3


From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:24:08 +0200
Subject: perf_counter: fix threaded task exit

Flushing counters in __exit_signal() with irqs disabled is not
a good idea as perf_counter_exit_task() acquires mutexes. So
flush it before acquiring the tasklist lock.

(Note, we still need a fix for when the PID has been unhashed.)

[ Impact: fix crash with inherited counters ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 16d74f13a3e7..73affd35e76d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	__unhash_process(tsk);
 
 	/*
@@ -183,6 +177,13 @@ repeat:
 	atomic_dec(&__task_cred(p)->user->processes);
 
 	proc_flush_task(p);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
-- 
cgit v1.2.3


From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:08:41 +0200
Subject: perf_counter: fix counter freeing logic

Fix counter lifetime bugs which explain the crashes reported by
Marcelo Tosatti and Arnaldo Carvalho de Melo.

The new rule is: flushing + freeing is only done for a task's
own counters, never for other tasks.

[ Impact: fix crashes/lockups with inherited counters ]

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 73affd35e76d..f9dfedd94af0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,12 +178,6 @@ repeat:
 
 	proc_flush_task(p);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
@@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
 	mpol_put(tsk->mempolicy);
@@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-- 
cgit v1.2.3


From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 22 May 2009 14:17:31 +1000
Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct

This replaces the struct perf_counter_context in the task_struct with
a pointer to a dynamically allocated perf_counter_context struct.  The
main reason for doing is this is to allow us to transfer a
perf_counter_context from one task to another when we do lazy PMU
switching in a later patch.

This has a few side-benefits: the task_struct becomes a little smaller,
we save some memory because only tasks that have perf_counters attached
get a perf_counter_context allocated for them, and we can remove the
inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't
end up recompiling nearly everything whenever perf_counter.h changes.

The perf_counter_context structures are reference-counted and freed
when the last reference is dropped.  A context can have references
from its task and the counters on its task.  Counters can outlive the
task so it is possible that a context will be freed well after its
task has exited.

Contexts are allocated on fork if the parent had a context, or
otherwise the first time that a per-task counter is created on a task.
In the latter case, we set the context pointer in the task struct
locklessly using an atomic compare-and-exchange operation in case we
raced with some other task in creating a context for the subject task.

This also removes the task pointer from the perf_counter struct.  The
task pointer was not used anywhere and would make it harder to move a
context from one task to another.  Anything that needed to know which
task a counter was attached to was already using counter->ctx->task.

The __perf_counter_init_context function moves up in perf_counter.c
so that it can be called from find_get_context, and now initializes
the refcount, but is otherwise unchanged.

We were potentially calling list_del_counter twice: once from
__perf_counter_exit_task when the task exits and once from
__perf_counter_remove_from_context when the counter's fd gets closed.
This adds a check in list_del_counter so it doesn't do anything if
the counter has already been removed from the lists.

Since perf_counter_task_sched_in doesn't do anything if the task doesn't
have a context, and leaves cpuctx->task_ctx = NULL, this adds code to
__perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in
the case where the current task adds the first counter to itself and
thus creates a context for itself.

This also adds similar code to __perf_counter_enable to handle a
similar situation which can arise when the counters have been disabled
using prctl; that also leaves cpuctx->task_ctx = NULL.

[ Impact: refactor counter context management to prepare for new feature ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index f9dfedd94af0..99ad4063ee4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
+#include <linux/perf_counter.h>
 #include <trace/sched.h>
 
 #include <asm/uaccess.h>
@@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
 #ifdef CONFIG_PERF_COUNTERS
-	WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+	WARN_ON_ONCE(tsk->perf_counter_ctxp);
 #endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
-- 
cgit v1.2.3


From 87245135d5057edd5a8037131f81eeffd76d4fef Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:23 -0700
Subject: allow_signal: kill the bogus ->mm check, add a note about
 CLONE_SIGHAND

allow_signal() checks ->mm == NULL.  Not sure why.  Perhaps to make sure
current is the kernel thread.  But this helper must not be used unless we
are the kernel thread, kill this check.

Also, document the fact that the CLONE_SIGHAND kthread must not use
allow_signal(), unless the caller really wants to change the parent's
->sighand->action as well.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index b6c90b5ef509..533e5f85669a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -375,9 +375,8 @@ static void set_special_pids(struct pid *pid)
 }
 
 /*
- * Let kernel threads use this to say that they
- * allow a certain signal (since daemonize() will
- * have disabled all of them by default).
+ * Let kernel threads use this to say that they allow a certain signal.
+ * Must not be used if kthread was cloned with CLONE_SIGHAND.
  */
 int allow_signal(int sig)
 {
@@ -385,14 +384,14 @@ int allow_signal(int sig)
 		return -EINVAL;
 
 	spin_lock_irq(&current->sighand->siglock);
+	/* This is only needed for daemonize()'ed kthreads */
 	sigdelset(&current->blocked, sig);
-	if (!current->mm) {
-		/* Kernel threads handle their own signals.
-		   Let the signal code know it'll be handled, so
-		   that they don't get converted to SIGKILL or
-		   just silently dropped */
-		current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
-	}
+	/*
+	 * Kernel threads handle their own signals. Let the signal code
+	 * know it'll be handled, so that they don't get converted to
+	 * SIGKILL or just silently dropped.
+	 */
+	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 	return 0;
-- 
cgit v1.2.3


From dea33cfd99022d82d923a0c6a3bd895fb6683fb2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:29 -0700
Subject: ptrace: mm_need_new_owner: use ->real_parent to search in the
 siblings

"Search in the siblings" should use ->real_parent, not ->parent.  If the
task is traced then ->parent == tracer, while the task's parent is always
->real_parent.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 533e5f85669a..213f906f5e16 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -590,7 +590,7 @@ retry:
 	/*
 	 * Search in the siblings
 	 */
-	list_for_each_entry(c, &p->parent->children, sibling) {
+	list_for_each_entry(c, &p->real_parent->children, sibling) {
 		if (c->mm == mm)
 			goto assign_new_owner;
 	}
-- 
cgit v1.2.3


From 5cb11446892833e50970fb2277a9f7563b0a8bd3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:30 -0700
Subject: ptrace: do not use task->ptrace directly in core kernel

No functional changes.

- Nobody except ptrace.c & co should use ptrace flags directly, we have
  task_ptrace() for that.

- No need to specially check PT_PTRACED, we must not have other PT_ bits
  set without PT_PTRACED. And no need to know this flag exists.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 213f906f5e16..938cceebb9ad 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -757,7 +757,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
 	p->exit_signal = SIGCHLD;
 
 	/* If it has exited notify the new parent about this child's death. */
-	if (!p->ptrace &&
+	if (!task_ptrace(p) &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		do_notify_parent(p, p->exit_signal);
 		if (task_detached(p)) {
@@ -782,7 +782,7 @@ static void forget_original_parent(struct task_struct *father)
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
 		p->real_parent = reaper;
 		if (p->parent == father) {
-			BUG_ON(p->ptrace);
+			BUG_ON(task_ptrace(p));
 			p->parent = p->real_parent;
 		}
 		reparent_thread(father, p, &dead_children);
@@ -1482,7 +1482,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(p->ptrace)) {
+	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
-- 
cgit v1.2.3


From d1e98f429aa10132b3010ba3b0be47552a2eb14b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:34 -0700
Subject: ptrace: wait_task_zombie: s/->parent/->real_parent/

Change wait_task_zombie() to use ->real_parent instead of ->parent.  We
could even use current afaics, but ->real_parent is more clean.

We know that the child is not ptrace_reparented() and thus they are equal.
 But we should avoid using task_struct->parent, we are going to remove it.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 938cceebb9ad..826e1dc8168b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1204,7 +1204,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * p->signal fields, because they are only touched by
 		 * __exit_signal, which runs with tasklist_lock
 		 * write-locked anyway, and so is excluded here.  We do
-		 * need to protect the access to p->parent->signal fields,
+		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
 		 *
@@ -1213,8 +1213,8 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * group including the group leader.
 		 */
 		thread_group_cputime(p, &cputime);
-		spin_lock_irq(&p->parent->sighand->siglock);
-		psig = p->parent->signal;
+		spin_lock_irq(&p->real_parent->sighand->siglock);
+		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
@@ -1245,7 +1245,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 			sig->oublock + sig->coublock;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
-		spin_unlock_irq(&p->parent->sighand->siglock);
+		spin_unlock_irq(&p->real_parent->sighand->siglock);
 	}
 
 	/*
-- 
cgit v1.2.3


From 77d1ef79568b337f599b75795acc8f78a87ba9ba Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:36 -0700
Subject: wait_task_zombie: do not use thread_group_cputime()

There is no reason for thread_group_cputime() in wait_task_zombie(), there
must be no other threads.

This call was previously needed to collect the per-cpu data which we do
not have any longer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Vitaly Mayatskikh <vmayatsk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 826e1dc8168b..94a9992e6fd9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1191,7 +1191,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (likely(!traced)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
-		struct task_cputime cputime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1207,23 +1206,20 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
-		 *
-		 * We use thread_group_cputime() to get times for the thread
-		 * group, which consolidates times for all threads in the
-		 * group including the group leader.
 		 */
-		thread_group_cputime(p, &cputime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(cputime.utime,
-				    sig->cutime));
+			cputime_add(p->utime,
+			cputime_add(sig->utime,
+				    sig->cutime)));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(cputime.stime,
-				    sig->cstime));
+			cputime_add(p->stime,
+			cputime_add(sig->stime,
+				    sig->cstime)));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
-- 
cgit v1.2.3


From 47918025efdabd34e96b13b26eb2cf2fd6fd1f7c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:39 -0700
Subject: shift "ptrace implies WUNTRACED" from ptrace_do_wait() to
 wait_task_stopped()

No functional changes, preparation for the next patch.

ptrace_do_wait() adds WUNTRACED to options for wait_task_stopped() which
should always accept the stopped tracee, even if do_wait() was called
without WUNTRACED.

Change wait_task_stopped() to check "ptrace || WUNTRACED" instead.  This
makes the code more explicit, and "int options" argument becomes const in
do_wait() pathes.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 94a9992e6fd9..fd781b56401d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1330,7 +1330,10 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
-	if (!(options & WUNTRACED))
+	/*
+	 * Traditionally we see ptrace'd stopped tasks regardless of options.
+	 */
+	if (!ptrace && !(options & WUNTRACED))
 		return 0;
 
 	exit_code = 0;
@@ -1548,11 +1551,6 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
 {
 	struct task_struct *p;
 
-	/*
-	 * Traditionally we see ptrace'd stopped tasks regardless of options.
-	 */
-	options |= WUNTRACED;
-
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
 		int ret = wait_consider_task(tsk, 1, p, notask_error,
 					     type, pid, options,
-- 
cgit v1.2.3


From 9e8ae01d1c86dcaa6443c897662545d088036e4c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:39 -0700
Subject: introduce "struct wait_opts" to simplify do_wait() patches

Introduce "struct wait_opts" which holds the parameters for misc helpers
in do_wait() pathes.

This adds 13 lines to kernel/exit.c, but saves 256 bytes from .o and imho
makes the code much more readable.

This patch temporary uglifies rusage/siginfo code a little bit, will be
addressed by further cleanups.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 207 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 110 insertions(+), 97 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index fd781b56401d..29622e468b7f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1080,6 +1080,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
 	return 0;
 }
 
+struct wait_opts {
+	enum pid_type		wo_type;
+	struct pid		*wo_pid;
+	int			wo_flags;
+
+	struct siginfo __user	*wo_info;
+	int __user		*wo_stat;
+	struct rusage __user	*wo_rusage;
+
+	int			notask_error;
+};
+
 static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 {
 	struct pid *pid = NULL;
@@ -1090,13 +1102,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 	return pid;
 }
 
-static int eligible_child(enum pid_type type, struct pid *pid, int options,
-			  struct task_struct *p)
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 {
 	int err;
 
-	if (type < PIDTYPE_MAX) {
-		if (task_pid_type(p, type) != pid)
+	if (wo->wo_type < PIDTYPE_MAX) {
+		if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
 			return 0;
 	}
 
@@ -1105,8 +1116,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 	 * set; otherwise, wait for non-clone children *only*.  (Note:
 	 * A "clone" child here is one that reports to its parent
 	 * using a signal other than SIGCHLD.) */
-	if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
-	    && !(options & __WALL))
+	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+	    && !(wo->wo_flags & __WALL))
 		return 0;
 
 	err = security_task_wait(p);
@@ -1116,14 +1127,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 	return 1;
 }
 
-static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
-			       int why, int status,
-			       struct siginfo __user *infop,
-			       struct rusage __user *rusagep)
+static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
+				pid_t pid, uid_t uid, int why, int status)
 {
-	int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+	struct siginfo __user *infop;
+	int retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 
 	put_task_struct(p);
+	infop = wo->wo_info;
 	if (!retval)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval)
@@ -1147,19 +1159,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct task_struct *p, int options,
-			    struct siginfo __user *infop,
-			    int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 {
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 	uid_t uid = __task_cred(p)->uid;
+	struct siginfo __user *infop;
 
-	if (!likely(options & WEXITED))
+	if (!likely(wo->wo_flags & WEXITED))
 		return 0;
 
-	if (unlikely(options & WNOWAIT)) {
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
 		int exit_code = p->exit_code;
 		int why, status;
 
@@ -1172,8 +1183,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
 			status = exit_code & 0x7f;
 		}
-		return wait_noreap_copyout(p, pid, uid, why,
-					   status, infop, ru);
+		return wait_noreap_copyout(wo, p, pid, uid, why, status);
 	}
 
 	/*
@@ -1250,11 +1260,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
-	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-	if (!retval && stat_addr)
-		retval = put_user(status, stat_addr);
+	if (!retval && wo->wo_stat)
+		retval = put_user(status, wo->wo_stat);
+
+	infop = wo->wo_info;
 	if (!retval && infop)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval && infop)
@@ -1322,10 +1335,10 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_stopped(int ptrace, struct task_struct *p,
-			     int options, struct siginfo __user *infop,
-			     int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_stopped(struct wait_opts *wo,
+				int ptrace, struct task_struct *p)
 {
+	struct siginfo __user *infop;
 	int retval, exit_code, *p_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
@@ -1333,7 +1346,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	/*
 	 * Traditionally we see ptrace'd stopped tasks regardless of options.
 	 */
-	if (!ptrace && !(options & WUNTRACED))
+	if (!ptrace && !(wo->wo_flags & WUNTRACED))
 		return 0;
 
 	exit_code = 0;
@@ -1347,7 +1360,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	if (!exit_code)
 		goto unlock_sig;
 
-	if (!unlikely(options & WNOWAIT))
+	if (!unlikely(wo->wo_flags & WNOWAIT))
 		*p_code = 0;
 
 	/* don't need the RCU readlock here as we're holding a spinlock */
@@ -1369,14 +1382,15 @@ unlock_sig:
 	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
-	if (unlikely(options & WNOWAIT))
-		return wait_noreap_copyout(p, pid, uid,
-					   why, exit_code,
-					   infop, ru);
+	if (unlikely(wo->wo_flags & WNOWAIT))
+		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
+
+	retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+	if (!retval && wo->wo_stat)
+		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
 
-	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
-	if (!retval && stat_addr)
-		retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+	infop = wo->wo_info;
 	if (!retval && infop)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval && infop)
@@ -1403,15 +1417,13 @@ unlock_sig:
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_continued(struct task_struct *p, int options,
-			       struct siginfo __user *infop,
-			       int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 {
 	int retval;
 	pid_t pid;
 	uid_t uid;
 
-	if (!unlikely(options & WCONTINUED))
+	if (!unlikely(wo->wo_flags & WCONTINUED))
 		return 0;
 
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1423,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int options,
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
-	if (!unlikely(options & WNOWAIT))
+	if (!unlikely(wo->wo_flags & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	uid = __task_cred(p)->uid;
 	spin_unlock_irq(&p->sighand->siglock);
@@ -1432,17 +1444,17 @@ static int wait_task_continued(struct task_struct *p, int options,
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
-	if (!infop) {
-		retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+	if (!wo->wo_info) {
+		retval = wo->wo_rusage
+			? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 		put_task_struct(p);
-		if (!retval && stat_addr)
-			retval = put_user(0xffff, stat_addr);
+		if (!retval && wo->wo_stat)
+			retval = put_user(0xffff, wo->wo_stat);
 		if (!retval)
 			retval = pid;
 	} else {
-		retval = wait_noreap_copyout(p, pid, uid,
-					     CLD_CONTINUED, SIGCONT,
-					     infop, ru);
+		retval = wait_noreap_copyout(wo, p, pid, uid,
+					     CLD_CONTINUED, SIGCONT);
 		BUG_ON(retval == 0);
 	}
 
@@ -1452,19 +1464,16 @@ static int wait_task_continued(struct task_struct *p, int options,
 /*
  * Consider @p for a wait by @parent.
  *
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child,
+ * then ->notask_error is 0 if @p is an eligible child,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int wait_consider_task(struct task_struct *parent, int ptrace,
-			      struct task_struct *p, int *notask_error,
-			      enum pid_type type, struct pid *pid, int options,
-			      struct siginfo __user *infop,
-			      int __user *stat_addr, struct rusage __user *ru)
+static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
+				int ptrace, struct task_struct *p)
 {
-	int ret = eligible_child(type, pid, options, p);
+	int ret = eligible_child(wo, p);
 	if (!ret)
 		return ret;
 
@@ -1476,8 +1485,8 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		 * to look for security policy problems, rather
 		 * than for mysterious wait bugs.
 		 */
-		if (*notask_error)
-			*notask_error = ret;
+		if (wo->notask_error)
+			wo->notask_error = ret;
 		return 0;
 	}
 
@@ -1486,7 +1495,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
 		 */
-		*notask_error = 0;
+		wo->notask_error = 0;
 		return 0;
 	}
 
@@ -1497,34 +1506,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 	 * We don't reap group leaders with subthreads.
 	 */
 	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
-		return wait_task_zombie(p, options, infop, stat_addr, ru);
+		return wait_task_zombie(wo, p);
 
 	/*
 	 * It's stopped or running now, so it might
 	 * later continue, exit, or stop again.
 	 */
-	*notask_error = 0;
+	wo->notask_error = 0;
 
 	if (task_stopped_code(p, ptrace))
-		return wait_task_stopped(ptrace, p, options,
-					 infop, stat_addr, ru);
+		return wait_task_stopped(wo, ptrace, p);
 
-	return wait_task_continued(p, options, infop, stat_addr, ru);
+	return wait_task_continued(wo, p);
 }
 
 /*
  * Do the work of do_wait() for one thread in the group, @tsk.
  *
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children,
+ * ->notask_error is 0 if there were any eligible children,
  * or another error from security_task_wait(), or still -ECHILD.
  */
-static int do_wait_thread(struct task_struct *tsk, int *notask_error,
-			  enum pid_type type, struct pid *pid, int options,
-			  struct siginfo __user *infop, int __user *stat_addr,
-			  struct rusage __user *ru)
+static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
 	struct task_struct *p;
 
@@ -1533,9 +1538,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 		 * Do not consider detached threads.
 		 */
 		if (!task_detached(p)) {
-			int ret = wait_consider_task(tsk, 0, p, notask_error,
-						     type, pid, options,
-						     infop, stat_addr, ru);
+			int ret = wait_consider_task(wo, tsk, 0, p);
 			if (ret)
 				return ret;
 		}
@@ -1544,17 +1547,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 	return 0;
 }
 
-static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
-			  enum pid_type type, struct pid *pid, int options,
-			  struct siginfo __user *infop, int __user *stat_addr,
-			  struct rusage __user *ru)
+static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 {
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(tsk, 1, p, notask_error,
-					     type, pid, options,
-					     infop, stat_addr, ru);
+		int ret = wait_consider_task(wo, tsk, 1, p);
 		if (ret)
 			return ret;
 	}
@@ -1562,38 +1560,36 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
 	return 0;
 }
 
-static long do_wait(enum pid_type type, struct pid *pid, int options,
-		    struct siginfo __user *infop, int __user *stat_addr,
-		    struct rusage __user *ru)
+static long do_wait(struct wait_opts *wo)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
 	int retval;
 
-	trace_sched_process_wait(pid);
+	trace_sched_process_wait(wo->wo_pid);
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
 	/*
 	 * If there is nothing that can match our critiera just get out.
-	 * We will clear @retval to zero if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
+	 * We will clear ->notask_error to zero if we see any child that
+	 * might later match our criteria, even if we are not able to reap
+	 * it yet.
 	 */
-	retval = -ECHILD;
-	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
+	retval = wo->notask_error = -ECHILD;
+	if ((wo->wo_type < PIDTYPE_MAX) &&
+	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
 		goto end;
 
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		int tsk_result = do_wait_thread(tsk, &retval,
-						type, pid, options,
-						infop, stat_addr, ru);
+		int tsk_result = do_wait_thread(wo, tsk);
+
 		if (!tsk_result)
-			tsk_result = ptrace_do_wait(tsk, &retval,
-						    type, pid, options,
-						    infop, stat_addr, ru);
+			tsk_result = ptrace_do_wait(wo, tsk);
+
 		if (tsk_result) {
 			/*
 			 * tasklist_lock is unlocked and we have a final result.
@@ -1602,25 +1598,27 @@ repeat:
 			goto end;
 		}
 
-		if (options & __WNOTHREAD)
+		if (wo->wo_flags & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
 		BUG_ON(tsk->signal != current->signal);
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
-	if (!retval && !(options & WNOHANG)) {
+	retval = wo->notask_error;
+	if (!retval && !(wo->wo_flags & WNOHANG)) {
 		retval = -ERESTARTSYS;
 		if (!signal_pending(current)) {
 			schedule();
 			goto repeat;
 		}
 	}
-
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
-	if (infop) {
+	if (wo->wo_info) {
+		struct siginfo __user *infop = wo->wo_info;
+
 		if (retval > 0)
 			retval = 0;
 		else {
@@ -1649,6 +1647,7 @@ end:
 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 		infop, int, options, struct rusage __user *, ru)
 {
+	struct wait_opts wo;
 	struct pid *pid = NULL;
 	enum pid_type type;
 	long ret;
@@ -1678,7 +1677,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 
 	if (type < PIDTYPE_MAX)
 		pid = find_get_pid(upid);
-	ret = do_wait(type, pid, options, infop, NULL, ru);
+
+	wo.wo_type	= type;
+	wo.wo_pid	= pid;
+	wo.wo_flags	= options;
+	wo.wo_info	= infop;
+	wo.wo_stat	= NULL;
+	wo.wo_rusage	= ru;
+	ret = do_wait(&wo);
 	put_pid(pid);
 
 	/* avoid REGPARM breakage on x86: */
@@ -1689,6 +1695,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		int, options, struct rusage __user *, ru)
 {
+	struct wait_opts wo;
 	struct pid *pid = NULL;
 	enum pid_type type;
 	long ret;
@@ -1710,7 +1717,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		pid = find_get_pid(upid);
 	}
 
-	ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
+	wo.wo_type	= type;
+	wo.wo_pid	= pid;
+	wo.wo_flags	= options | WEXITED;
+	wo.wo_info	= NULL;
+	wo.wo_stat	= stat_addr;
+	wo.wo_rusage	= ru;
+	ret = do_wait(&wo);
 	put_pid(pid);
 
 	/* avoid REGPARM breakage on x86: */
-- 
cgit v1.2.3


From 64a16caf5e3417ee32f670debcb5857b02a9e08e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:40 -0700
Subject: do_wait: simplify retval/tsk_result/notask_error mess

Now that we don't pass &retval down to other helpers we can simplify
the code more.

- kill tsk_result, just use retval

- add the "notask" label right after the main loop, and
  s/got end/goto notask/ after the fastpath pid check.

  This way we don't need to initialize retval before this
  check and the code becomes a bit more clean, if this pid
  has no attached tasks we should just skip the list search.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 29622e468b7f..9c6881a0a8b4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1576,27 +1576,22 @@ repeat:
 	 * might later match our criteria, even if we are not able to reap
 	 * it yet.
 	 */
-	retval = wo->notask_error = -ECHILD;
+	wo->notask_error = -ECHILD;
 	if ((wo->wo_type < PIDTYPE_MAX) &&
 	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
-		goto end;
+		goto notask;
 
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		int tsk_result = do_wait_thread(wo, tsk);
-
-		if (!tsk_result)
-			tsk_result = ptrace_do_wait(wo, tsk);
+		retval = do_wait_thread(wo, tsk);
+		if (retval)
+			goto end;
 
-		if (tsk_result) {
-			/*
-			 * tasklist_lock is unlocked and we have a final result.
-			 */
-			retval = tsk_result;
+		retval = ptrace_do_wait(wo, tsk);
+		if (retval)
 			goto end;
-		}
 
 		if (wo->wo_flags & __WNOTHREAD)
 			break;
@@ -1605,6 +1600,7 @@ repeat:
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
+notask:
 	retval = wo->notask_error;
 	if (!retval && !(wo->wo_flags & WNOHANG)) {
 		retval = -ERESTARTSYS;
-- 
cgit v1.2.3


From a3f6dfb7295facb0505b5beca5a7ce48b0612379 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:41 -0700
Subject: do_wait: kill the old BUG_ON, use while_each_thread()

do_wait() does BUG_ON(tsk->signal != current->signal), this looks like a
raher obsolete check.  At least, I don't think do_wait() is the best place
to verify that all threads have the same ->signal.  Remove it.

Also, change the code to use while_each_thread().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 9c6881a0a8b4..dd83c8419101 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1595,9 +1595,7 @@ repeat:
 
 		if (wo->wo_flags & __WNOTHREAD)
 			break;
-		tsk = next_thread(tsk);
-		BUG_ON(tsk->signal != current->signal);
-	} while (tsk != current);
+	} while_each_thread(current, tsk);
 	read_unlock(&tasklist_lock);
 
 notask:
-- 
cgit v1.2.3


From f95d39d10fe7d47336e65172f52bf64e0096f983 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 17 Jun 2009 16:27:42 -0700
Subject: do_wait: fix the theoretical race with stop/trace/cont

do_wait:

	current->state = TASK_INTERRUPTIBLE;

	read_lock(&tasklist_lock);
	... search for the task to reap ...

In theory, the ->state changing can leak into the critical section.  Since
the child can change its status under read_lock(tasklist) in parallel
(finish_stop/ptrace_stop), we can miss the wakeup if __wake_up_parent()
sees us in TASK_RUNNING state.  Add the barrier.

Also, use __set_current_state() to set TASK_RUNNING.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index dd83c8419101..7ef355dd3dca 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1581,7 +1581,7 @@ repeat:
 	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
 		goto notask;
 
-	current->state = TASK_INTERRUPTIBLE;
+	set_current_state(TASK_INTERRUPTIBLE);
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
@@ -1608,7 +1608,7 @@ notask:
 		}
 	}
 end:
-	current->state = TASK_RUNNING;
+	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
 	if (wo->wo_info) {
 		struct siginfo __user *infop = wo->wo_info;
-- 
cgit v1.2.3


From e1eb1ebcca871673c76caf63335c4237680040f1 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Wed, 17 Jun 2009 16:27:42 -0700
Subject: mm: exit.c reorder wait_opts to remove padding on 64 bit builds

Reorder struct wait_opts to remove 8 bytes of alignment padding on 64 bit
builds.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 7ef355dd3dca..13ae64001fec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1082,8 +1082,8 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
 
 struct wait_opts {
 	enum pid_type		wo_type;
-	struct pid		*wo_pid;
 	int			wo_flags;
+	struct pid		*wo_pid;
 
 	struct siginfo __user	*wo_info;
 	int __user		*wo_stat;
-- 
cgit v1.2.3


From befca96779b0259ac8fad0183e748a62935c39cb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 18 Jun 2009 16:49:11 -0700
Subject: ptrace: wait_task_zombie: do not account traced sub-threads

The bug is ancient.

If we trace the sub-thread of our natural child and this sub-thread exits,
we update parent->signal->cxxx fields.  But we should not do this until
the whole thread-group exits, otherwise we account this thread (and all
other live threads) twice.

Add the task_detached() check.  No need to check thread_group_empty(),
wait_consider_task()->delay_group_leader() already did this.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Vitaly Mayatskikh <vmayatsk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 13ae64001fec..628d41f0dd54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1197,8 +1197,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	}
 
 	traced = ptrace_reparented(p);
-
-	if (likely(!traced)) {
+	/*
+	 * It can be ptraced but not reparented, check
+	 * !task_detached() to filter out sub-threads.
+	 */
+	if (likely(!traced) && likely(!task_detached(p))) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 
-- 
cgit v1.2.3