From e5c1902e9260a0075ea52cb5ef627a8d9aaede89 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b601be3dace..85f51042c2b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1260,6 +1260,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ + unsigned int group_stop; /* GROUP_STOP_*, siglock protected */ /* ??? */ unsigned int personality; unsigned did_exec:1; @@ -1771,6 +1772,11 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +/* + * task->group_stop flags + */ +#define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ + #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -- cgit v1.2.3 From 39efa3ef3a376a4e53de2f82fc91182459d34200 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: signal: Use GROUP_STOP_PENDING to stop once for a single group stop Currently task->signal->group_stop_count is used to decide whether to stop for group stop. However, if there is a task in the group which is taking a long time to stop, other tasks which are continued by ptrace would repeatedly stop for the same group stop until the group stop is complete. Conversely, if a ptraced task is in TASK_TRACED state, the debugger won't get notified of group stops which is inconsistent compared to the ptraced task in any other state. This patch introduces GROUP_STOP_PENDING which tracks whether a task is yet to stop for the group stop in progress. The flag is set when a group stop starts and cleared when the task stops the first time for the group stop, and consulted whenever whether the task should participate in a group stop needs to be determined. Note that now tasks in TASK_TRACED also participate in group stop. This results in the following behavior changes. * For a single group stop, a ptracer would see at most one stop reported. * A ptracee in TASK_TRACED now also participates in group stop and the tracer would get the notification. However, as a ptraced task could be in TASK_STOPPED state or any ptrace trap could consume group stop, the notification may still be missing. These will be addressed with further patches. * A ptracee may start a group stop while one is still in progress if the tracer let it continue with stop signal delivery. Group stop code handles this correctly. Oleg: * Spotted that a task might skip signal check even when its GROUP_STOP_PENDING is set. Fixed by updating recalc_sigpending_tsk() to check GROUP_STOP_PENDING instead of group_stop_count. * Pointed out that task->group_stop should be cleared whenever task->signal->group_stop_count is cleared. Fixed accordingly. * Pointed out the behavior inconsistency between TASK_TRACED and RUNNING and the last behavior change. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 85f51042c2b8..b2a17dfbdbad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1775,8 +1775,11 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * task->group_stop flags */ +#define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ #define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ +extern void task_clear_group_stop_pending(struct task_struct *task); + #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -- cgit v1.2.3 From d79fdd6d96f46fabb779d86332e3677c6f5c2a4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Jan Kratochvil --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b2a17dfbdbad..456d80ed3b78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1775,8 +1775,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * task->group_stop flags */ +#define GROUP_STOP_SIGMASK 0xffff /* signr of the last group stop */ #define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ #define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ +#define GROUP_STOP_TRAPPING (1 << 18) /* switching from STOPPED to TRACED */ extern void task_clear_group_stop_pending(struct task_struct *task); -- cgit v1.2.3 From ee77f075921730b2b465880f9fd4367003bdab39 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 1 Apr 2011 20:12:38 +0200 Subject: signal: Turn SIGNAL_STOP_DEQUEUED into GROUP_STOP_DEQUEUED This patch moves SIGNAL_STOP_DEQUEUED from signal_struct->flags to task_struct->group_stop, and thus makes it per-thread. Like SIGNAL_STOP_DEQUEUED, GROUP_STOP_DEQUEUED can be false-positive after return from get_signal_to_deliver(), this is fine. The only purpose of this bit is: we can drop ->siglock after __dequeue_signal() returns the sig_kernel_stop() signal and before we call do_signal_stop(), in this case we must not miss SIGCONT if it comes in between. But, unlike SIGNAL_STOP_DEQUEUED, GROUP_STOP_DEQUEUED can not be false-positive in do_signal_stop() if multiple threads dequeue the sig_kernel_stop() signal at the same time. Consider two threads T1 and T2, SIGTTIN has a hanlder. - T1 dequeues SIGTSTP and sets SIGNAL_STOP_DEQUEUED, then it drops ->siglock - SIGCONT comes and clears SIGNAL_STOP_DEQUEUED, SIGTSTP should be cancelled. - T2 dequeues SIGTTIN and sets SIGNAL_STOP_DEQUEUED again. Since we have a handler we should not stop, T2 returns to usermode to run the handler. - T1 continues, calls do_signal_stop() and wrongly starts the group stop because SIGNAL_STOP_DEQUEUED was restored in between. With or without this change: - we need to do something with ptrace_signal() which can return SIGSTOP, but this needs another discussion - SIGSTOP can be lost if it races with the mt exec, will be fixed later. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- include/linux/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 456d80ed3b78..8cef82d4cf77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -652,9 +652,8 @@ struct signal_struct { * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ -#define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ -#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ -#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ +#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. */ @@ -1779,6 +1778,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ #define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ #define GROUP_STOP_TRAPPING (1 << 18) /* switching from STOPPED to TRACED */ +#define GROUP_STOP_DEQUEUED (1 << 19) /* stop signal dequeued */ extern void task_clear_group_stop_pending(struct task_struct *task); -- cgit v1.2.3 From dce840a08702bd13a9a186e07e63d1ef82256b5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:50 +0200 Subject: sched: Dynamically allocate sched_domain/sched_group data-structures Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them. Allocating this dynamically also allows for some build_sched_groups() simplification since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge. One tricky to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down, per-cpu is not sufficient since that can lead to partial sched_group existance (could possibly be solved by doing the tear-down backwards but this is much more robust). A concequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain() since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations. Another thing to note is that we now fully rely on normal RCU and not RCU-sched, this is because with the new and exiting RCU flavours we grew over the years BH doesn't necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief since we do sched_domain/sched_group iterations from softirq context. This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting this. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c027e92c..020b79d6c486 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -868,6 +868,7 @@ static inline int sd_power_saving_flags(void) struct sched_group { struct sched_group *next; /* Must be a circular list */ + atomic_t ref; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@ -973,6 +974,10 @@ struct sched_domain { #ifdef CONFIG_SCHED_DEBUG char *name; #endif + union { + void *private; /* used during construction */ + struct rcu_head rcu; /* used during destruction */ + }; unsigned int span_weight; /* -- cgit v1.2.3 From 3859173d43658d51a749bc0201b943922577d39c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:53 +0200 Subject: sched: Reduce some allocation pressure Since we now allocate SD_LV_MAX * nr_cpu_ids sched_domain/sched_group structures when rebuilding the scheduler toplogy it might make sense to shrink that depending on the CONFIG_ options. This is only needed until we get rid of SD_LV_* alltogether and provide a full dynamic topology interface. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.406226449@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 020b79d6c486..5a9168b01db8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -897,12 +897,20 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) enum sched_domain_level { SD_LV_NONE = 0, +#ifdef CONFIG_SCHED_SMT SD_LV_SIBLING, +#endif +#ifdef CONFIG_SCHED_MC SD_LV_MC, +#endif +#ifdef CONFIG_SCHED_BOOK SD_LV_BOOK, +#endif SD_LV_CPU, +#ifdef CONFIG_NUMA SD_LV_NODE, SD_LV_ALLNODES, +#endif SD_LV_MAX }; -- cgit v1.2.3 From 7dd04b730749f957c116f363524fd622b05e5141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:56 +0200 Subject: sched: Remove some dead code Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.553814623@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a9168b01db8..09d9e02f2b61 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -883,9 +883,6 @@ struct sched_group { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_group' in kernel/sched.c) */ unsigned long cpumask[0]; }; @@ -994,9 +991,6 @@ struct sched_domain { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_domain' in kernel/sched.c) */ unsigned long span[0]; }; -- cgit v1.2.3 From 60495e7760d8ee364695006af37309b0755e0e17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:04 +0200 Subject: sched: Dynamic sched_domain::level Remove the SD_LV_ enum and use dynamic level assignments. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.969433965@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 09d9e02f2b61..e43e5b0ab0b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -892,25 +892,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } -enum sched_domain_level { - SD_LV_NONE = 0, -#ifdef CONFIG_SCHED_SMT - SD_LV_SIBLING, -#endif -#ifdef CONFIG_SCHED_MC - SD_LV_MC, -#endif -#ifdef CONFIG_SCHED_BOOK - SD_LV_BOOK, -#endif - SD_LV_CPU, -#ifdef CONFIG_NUMA - SD_LV_NODE, - SD_LV_ALLNODES, -#endif - SD_LV_MAX -}; - struct sched_domain_attr { int relax_domain_level; }; @@ -919,6 +900,8 @@ struct sched_domain_attr { .relax_domain_level = -1, \ } +extern int sched_domain_level_max; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -936,7 +919,7 @@ struct sched_domain { unsigned int forkexec_idx; unsigned int smt_gain; int flags; /* See SD_* */ - enum sched_domain_level level; + int level; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ -- cgit v1.2.3 From 184748cc50b2dceb8287f9fb657eda48ff8fcfe7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:39 +0200 Subject: sched: Provide scheduler_ipi() callback in response to smp_send_reschedule() For future rework of try_to_wake_up() we'd like to push part of that function onto the CPU the task is actually going to run on. In order to do so we need a generic callback from the existing scheduler IPI. This patch introduces such a generic callback: scheduler_ipi() and implements it as a NOP. BenH notes: PowerPC might use this IPI on offline CPUs under rare conditions! Acked-by: Russell King Acked-by: Martin Schwidefsky Acked-by: Chris Metcalf Acked-by: Jesper Nilsson Acked-by: Benjamin Herrenschmidt Signed-off-by: Ralf Baechle Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.744338123@chello.nl --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c027e92c..758e27afcda5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2189,8 +2189,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +static inline void scheduler_ipi(void) { } extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.2.3 From 3ca7a440da394808571dad32d33d3bc0389982e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:40 +0200 Subject: sched: Always provide p->on_cpu Always provide p->on_cpu so that we can determine if its on a cpu without having to lock the rq. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.785452014@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 758e27afcda5..3435837e89ff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1200,9 +1200,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - int oncpu; -#endif + int on_cpu; #endif int prio, static_prio, normal_prio; -- cgit v1.2.3 From c6eb3dda25892f1f974f5420f63e6721aab02f6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:41 +0200 Subject: mutex: Use p->on_cpu for the adaptive spin Since we now have p->on_cpu unconditionally available, use it to re-implement mutex_spin_on_owner. Requested-by: Thomas Gleixner Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.826338173@chello.nl --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3435837e89ff..173850479e2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; -- cgit v1.2.3 From fd2f4419b4cbe8fe90796df9617c355762afd6a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:44 +0200 Subject: sched: Provide p->on_rq Provide a generic p->on_rq because the p->se.on_rq semantics are unfavourable for lockless wakeups but needed for sched_fair. In particular, p->on_rq is only cleared when we actually dequeue the task in schedule() and not on any random dequeue as done by things like __migrate_task() and __sched_setscheduler(). This also allows us to remove p->se usage from !sched_fair code. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.949545047@chello.nl --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 173850479e2c..b33a700652dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,6 +1202,7 @@ struct task_struct { #ifdef CONFIG_SMP int on_cpu; #endif + int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; -- cgit v1.2.3 From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:46 +0200 Subject: sched: Drop the rq argument to sched_class::select_task_rq() In preparation of calling select_task_rq() without rq->lock held, drop the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b33a700652dc..ff4e2f9c24a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,8 +1067,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct rq *rq, struct task_struct *p, - int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); -- cgit v1.2.3 From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:47 +0200 Subject: sched: Remove rq argument to sched_class::task_waking() In preparation of calling this without rq->lock held, remove the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff4e2f9c24a7..7f5732f8c618 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1048,8 +1048,12 @@ struct sched_domain; #define WF_FORK 0x02 /* child wakeup after fork */ #define ENQUEUE_WAKEUP 1 -#define ENQUEUE_WAKING 2 -#define ENQUEUE_HEAD 4 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif #define DEQUEUE_SLEEP 1 @@ -1071,7 +1075,7 @@ struct sched_class { void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, -- cgit v1.2.3 From a8e4f2eaecc9bfa4954adf79a04f4f22fddd829c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:49 +0200 Subject: sched: Delay task_contributes_to_load() In prepratation of having to call task_contributes_to_load() without holding rq->lock, we need to store the result until we do and can update the rq accounting accordingly. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.151523907@chello.nl --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7f5732f8c618..25c50317ddc1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1273,6 +1273,7 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; -- cgit v1.2.3 From 317f394160e9beb97d19a84c39b7e5eb3d7815a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:58 +0200 Subject: sched: Move the second half of ttwu() to the remote cpu Now that we've removed the rq->lock requirement from the first part of ttwu() and can compute placement without holding any rq->lock, ensure we execute the second half of ttwu() on the actual cpu we want the task to run on. This avoids having to take rq->lock and doing the task enqueue remotely, saving lots on cacheline transfers. As measured using: http://oss.oracle.com/~mason/sembench.c $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ; do echo performance > $i; done $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 647278 worker burns per second patched: run time 30 seconds 816715 worker burns per second Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.515897185@chello.nl --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25c50317ddc1..e09dafa6e149 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1203,6 +1203,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP + struct task_struct *wake_entry; int on_cpu; #endif int on_rq; @@ -2192,7 +2193,7 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -static inline void scheduler_ipi(void) { } +void scheduler_ipi(void); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } -- cgit v1.2.3 From 625f2a378e5a10f45fdc37932fc9f8a21676de9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 22 Apr 2011 11:19:10 -0600 Subject: sched: Get rid of lock_depth Neil Brown pointed out that lock_depth somehow escaped the BKL removal work. Let's get rid of it now. Note that the perf scripting utilities still have a bunch of code for dealing with common_lock_depth in tracepoints; I have left that in place in case anybody wants to use that code with older kernels. Suggested-by: Neil Brown Signed-off-by: Jonathan Corbet Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110422111910.456c0e84@bike.lwn.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 171ba24b08a7..013314a56105 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -731,10 +731,6 @@ struct sched_info { /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ -#ifdef CONFIG_SCHEDSTATS - /* BKL stats */ - unsigned int bkl_count; -#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -1190,8 +1186,6 @@ struct task_struct { unsigned int flags; /* per process flags, defined below */ unsigned int ptrace; - int lock_depth; /* BKL lock depth */ - #ifdef CONFIG_SMP struct task_struct *wake_entry; int on_cpu; -- cgit v1.2.3 From bf26c018490c2fce7fe9b629083b96ce0e6ad019 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 Apr 2011 16:53:20 +0200 Subject: ptrace: Prepare to fix racy accesses on task breakpoints When a task is traced and is in a stopped state, the tracer may execute a ptrace request to examine the tracee state and get its task struct. Right after, the tracee can be killed and thus its breakpoints released. This can happen concurrently when the tracer is in the middle of reading or modifying these breakpoints, leading to dereferencing a freed pointer. Hence, to prepare the fix, create a generic breakpoint reference holding API. When a reference on the breakpoints of a task is held, the breakpoints won't be released until the last reference is dropped. After that, no more ptrace request on the task's breakpoints can be serviced for the tracer. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Prasad Cc: Paul Mundt Cc: v2.6.33.. Link: http://lkml.kernel.org/r/1302284067-7860-2-git-send-email-fweisbec@gmail.com --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 18d63cea2848..781abd137673 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1537,6 +1537,9 @@ struct task_struct { unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; #endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT + atomic_t ptrace_bp_refcnt; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ -- cgit v1.2.3 From 3e51e3edfd81bfd9853ad7de91167e4ce33d0fe7 Mon Sep 17 00:00:00 2001 From: Samir Bellabes Date: Wed, 11 May 2011 18:18:05 +0200 Subject: sched: Remove unused parameters from sched_fork() and wake_up_new_task() sched_fork() and wake_up_new_task() are defined with a parameter 'unsigned long clone_flags', which is unused. This patch removes the parameters. Signed-off-by: Samir Bellabes Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305130685-1047-1-git-send-email-sam@synack.fr Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b4280b23ee6..12211e1666e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2051,14 +2051,13 @@ extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); -extern void wake_up_new_task(struct task_struct *tsk, - unsigned long clone_flags); +extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p, int clone_flags); +extern void sched_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); -- cgit v1.2.3 From 1399fa7807a1a5998bbf147e80668e9950661dfa Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 10:09:39 -0700 Subject: sched: Introduce SCHED_POWER_SCALE to scale cpu_power calculations SCHED_LOAD_SCALE is used to increase nice resolution and to scale cpu_power calculations in the scheduler. This patch introduces SCHED_POWER_SCALE and converts all uses of SCHED_LOAD_SCALE for scaling cpu_power to use SCHED_POWER_SCALE instead. This is a preparatory patch for increasing the resolution of SCHED_LOAD_SCALE, and there is no need to increase resolution for cpu_power calculations. Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1305738580-9924-3-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 12211e1666e2..f2f440221b70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -787,18 +787,21 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -/* - * sched-domains (multiprocessor balancing) declarations: - */ - /* * Increase resolution of nice-level calculations: */ #define SCHED_LOAD_SHIFT 10 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) -#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE +/* + * Increase resolution of cpu_power calculations + */ +#define SCHED_POWER_SHIFT 10 +#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +/* + * sched-domains (multiprocessor balancing) declarations: + */ #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ -- cgit v1.2.3 From c8b281161dfa4bb5d5be63fb036ce19347b88c63 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 14:37:48 -0700 Subject: sched: Increase SCHED_LOAD_SCALE resolution Introduce SCHED_LOAD_RESOLUTION, which scales is added to SCHED_LOAD_SHIFT and increases the resolution of SCHED_LOAD_SCALE. This patch sets the value of SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all sched entities by a factor of 1024. With this extra resolution, we can handle deeper cgroup hiearchies and the scheduler can do better shares distribution and load load balancing on larger systems (especially for low weight task groups). This does not change the existing user interface, the scaled weights are only used internally. We do not modify prio_to_weight values or inverses, but use the original weights when calculating the inverse which is used to scale execution time delta in calc_delta_mine(). This ensures we do not lose accuracy when accounting time to the sched entities. Thanks to Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness. Below is some analysis of the performance costs/improvements of this patch. 1. Micro-arch performance costs: Experiment was to run Ingo's pipe_test_100k 200 times with the task pinned to one cpu. I measured instruction, cycles and stalled-cycles for the runs. See: http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389 for more info. -tip (baseline): Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs): 964,991,769 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.05% ) 1,171,186,635 cycles # 0.000 GHz ( +- 0.08% ) 306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% ) 314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% ) 1.122405684 seconds time elapsed ( +- 0.05% ) -tip+patches: Performance counter stats for './load-scale/pipe-test-100k' (200 runs): 963,624,821 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.04% ) 1,175,215,649 cycles # 0.000 GHz ( +- 0.08% ) 315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% ) 316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% ) 1.122238659 seconds time elapsed ( +- 0.06% ) With this patch, instructions decrease by ~0.10% and cycles increase by 0.27%. This doesn't look statistically significant. The number of stalled cycles in the backend increased from 26.16% to 26.83%. This can be attributed to the shifts we do in c_d_m() and other places. The fraction of stalled cycles in the frontend remains about the same, at 26.96% compared to 26.89% in -tip. 2. Balancing low-weight task groups Test setup: run 50 tasks with random sleep/busy times (biased around 100ms) in a low weight container (with cpu.shares = 2). Measure %idle as reported by mpstat over a 10s window. -tip (baseline): 06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00 06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00 06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00 06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00 06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00 06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00 06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00 06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00 06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00 06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00 Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60 -tip+patches: 06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00 06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20 06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00 06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61 06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00 06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00 06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00 06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00 06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00 06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00 Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58 We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f440221b70..c34a718e20dd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -788,9 +788,28 @@ enum cpu_idle_type { }; /* - * Increase resolution of nice-level calculations: + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. */ -#define SCHED_LOAD_SHIFT 10 +#if BITS_PER_LONG > 32 +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) /* -- cgit v1.2.3 From 586692a5a5fc5740c8a46abc0f2365495c2d7c5f Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sun, 22 May 2011 22:10:22 -0700 Subject: watchdog: Disable watchdog when thresh is zero This restores the previous behavior of softlock_thresh. Currently, setting watchdog_thresh to zero causes the watchdog kthreads to consume a lot of CPU. In addition, the logic of proc_dowatchdog_thresh and proc_dowatchdog_enabled has been factored into proc_dowatchdog. Signed-off-by: Mandeep Singh Baines Cc: Marcin Slusarz Cc: Don Zickus Cc: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1306127423-3347-3-git-send-email-msb@chromium.org Signed-off-by: Ingo Molnar LKML-Reference: <20110517071018.GE22305@elte.hu> --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 12211e1666e2..d8b2d0bec0d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -315,7 +315,6 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; -extern int softlockup_thresh; void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) -- cgit v1.2.3 From 72788c385604523422592249c19cba0187021e9b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 24 May 2011 17:11:40 -0700 Subject: oom: replace PF_OOM_ORIGIN with toggling oom_score_adj There's a kernel-wide shortage of per-process flags, so it's always helpful to trim one when possible without incurring a significant penalty. It's even more important when you're planning on adding a per- process flag yourself, which I plan to do shortly for transparent hugepages. PF_OOM_ORIGIN is used by ksm and swapoff to prefer current since it has a tendency to allocate large amounts of memory and should be preferred for killing over other tasks. We'd rather immediately kill the task making the errant syscall rather than penalizing an innocent task. This patch removes PF_OOM_ORIGIN since its behavior is equivalent to setting the process's oom_score_adj to OOM_SCORE_ADJ_MAX. The process's old oom_score_adj is stored and then set to OOM_SCORE_ADJ_MAX during the time it used to have PF_OOM_ORIGIN. The old value is then reinstated when the process should no longer be considered a high priority for oom killing. Signed-off-by: David Rientjes Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Hugh Dickins Cc: Izik Eidus Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index aaf71e08222c..44b8faaac7c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1753,7 +1753,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_OOM_ORIGIN 0x00080000 /* Allocating much memory to others */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ -- cgit v1.2.3 From de03c72cfce5b263a674d04348b58475ec50163c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 May 2011 17:12:15 -0700 Subject: mm: convert mm->cpu_vm_cpumask into cpumask_var_t cpumask_t is very big struct and cpu_vm_mask is placed wrong position. It might lead to reduce cache hit ratio. This patch has two change. 1) Move the place of cpumask into last of mm_struct. Because usually cpumask is accessed only front bits when the system has cpu-hotplug capability 2) Convert cpu_vm_mask into cpumask_var_t. It may help to reduce memory footprint if cpumask_size() will use nr_cpumask_bits properly in future. In addition, this patch change the name of cpu_vm_mask with cpu_vm_mask_var. It may help to detect out of tree cpu_vm_mask users. This patch has no functional change. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KOSAKI Motohiro Cc: David Howells Cc: Koichi Yasutake Cc: Hugh Dickins Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 44b8faaac7c0..f18300eddfcb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2176,6 +2176,7 @@ static inline void mmdrop(struct mm_struct * mm) if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } +extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -- cgit v1.2.3 From b1cff0ad1062621ae63cb6c5dc4165191fe2e9f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 May 2011 14:27:43 -0400 Subject: ftrace: Add internal recursive checks Witold reported a reboot caused by the selftests of the dynamic function tracer. He sent me a config and I used ktest to do a config_bisect on it (as my config did not cause the crash). It pointed out that the problem config was CONFIG_PROVE_RCU. What happened was that if multiple callbacks are attached to the function tracer, we iterate a list of callbacks. Because the list is managed by synchronize_sched() and preempt_disable, the access to the pointers uses rcu_dereference_raw(). When PROVE_RCU is enabled, the rcu_dereference_raw() calls some debugging functions, which happen to be traced. The tracing of the debug function would then call rcu_dereference_raw() which would then call the debug function and then... well you get the idea. I first wrote two different patches to solve this bug. 1) add a __rcu_dereference_raw() that would not do any checks. 2) add notrace to the offending debug functions. Both of these patches worked. Talking with Paul McKenney on IRC, he suggested to add recursion detection instead. This seemed to be a better solution, so I decided to implement it. As the task_struct already has a trace_recursion to detect recursion in the ring buffer, and that has a very small number it allows, I decided to use that same variable to add flags that can detect the recursion inside the infrastructure of the function tracer. I plan to change it so that the task struct bit can be checked in mcount, but as that requires changes to all archs, I will hold that off to the next merge window. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Frederic Weisbecker Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1306348063.1465.116.camel@gandalf.stny.rr.com Reported-by: Witold Baryluk Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index d8b2d0bec0d8..7b78d9cad471 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1513,7 +1513,7 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; - /* bitmask of trace recursion */ + /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ -- cgit v1.2.3 From 4714d1d32d97239fb5ae3e10521d3f133a899b66 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:18 -0700 Subject: cgroups: read-write lock CLONE_THREAD forking per threadgroup Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup Add an rwsem that lives in a threadgroup's signal_struct that's taken for reading in the fork path, under CONFIG_CGROUPS. If another part of the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS ifdefs should be changed to a higher-up flag that CGROUPS and the other system would both depend on. This is a pre-patch for cgroup-procs-write.patch. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index f18300eddfcb..dc8871295a5a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -513,6 +513,7 @@ struct thread_group_cputimer { spinlock_t lock; }; +#include struct autogroup; /* @@ -632,6 +633,16 @@ struct signal_struct { unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + /* + * The threadgroup_fork_lock prevents threads from forking with + * CLONE_THREAD while held for writing. Use this for fork-sensitive + * threadgroup-wide operations. It's taken for reading in fork.c in + * copy_process(). + * Currently only needed write-side by cgroups. + */ + struct rw_semaphore threadgroup_fork_lock; +#endif int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ @@ -2323,6 +2334,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk, spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } +/* See the declaration of threadgroup_fork_lock in signal_struct. */ +#ifdef CONFIG_CGROUPS +static inline void threadgroup_fork_read_lock(struct task_struct *tsk) +{ + down_read(&tsk->signal->threadgroup_fork_lock); +} +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) +{ + up_read(&tsk->signal->threadgroup_fork_lock); +} +static inline void threadgroup_fork_write_lock(struct task_struct *tsk) +{ + down_write(&tsk->signal->threadgroup_fork_lock); +} +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) +{ + up_write(&tsk->signal->threadgroup_fork_lock); +} +#else +static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {} +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {} +static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {} +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {} +#endif + #ifndef __HAVE_THREAD_FUNCTIONS #define task_thread_info(task) ((struct thread_info *)(task)->stack) -- cgit v1.2.3 From 1e1b6c511d1b23cb7c3b619d82fc7bd9f620565d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 19 May 2011 15:08:58 +0900 Subject: cpuset: Fix cpuset_cpus_allowed_fallback(), don't update tsk->rt.nr_cpus_allowed The rule is, we have to update tsk->rt.nr_cpus_allowed if we change tsk->cpus_allowed. Otherwise RT scheduler may confuse. Signed-off-by: KOSAKI Motohiro Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4DD4B3FA.5060901@jp.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index dc8871295a5a..8da84b7bc1b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1841,9 +1841,16 @@ static inline void rcu_copy_process(struct task_struct *p) #endif #ifdef CONFIG_SMP +extern void do_set_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask); + extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); #else +static inline void do_set_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask) +{ +} static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { -- cgit v1.2.3 From 6345d24daf0c1fffe6642081d783cdf653ebaa5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 May 2011 11:32:28 -0700 Subject: mm: Fix boot crash in mm_alloc() Thomas Gleixner reports that we now have a boot crash triggered by CONFIG_CPUMASK_OFFSTACK=y: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] find_next_bit+0x55/0xb0 Call Trace: [] cpumask_any_but+0x2a/0x70 [] flush_tlb_mm+0x2b/0x80 [] pud_populate+0x35/0x50 [] pgd_alloc+0x9a/0xf0 [] mm_init+0xec/0x120 [] mm_alloc+0x53/0xd0 which was introduced by commit de03c72cfce5 ("mm: convert mm->cpu_vm_cpumask into cpumask_var_t"), and is due to wrong ordering of mm_init() vs mm_init_cpumask Thomas wrote a patch to just fix the ordering of initialization, but I hate the new double allocation in the fork path, so I ended up instead doing some more radical surgery to clean it all up. Reported-by: Thomas Gleixner Reported-by: Ingo Molnar Cc: KOSAKI Motohiro Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index bcddd0138105..2a8621c4be1e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2194,7 +2194,6 @@ static inline void mmdrop(struct mm_struct * mm) if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } -extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm); /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -- cgit v1.2.3 From f339b9dc1f03591761d5d930800db24bc0eda1e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 10:49:20 +0200 Subject: sched: Fix schedstat.nr_wakeups_migrate While looking over the code I found that with the ttwu rework the nr_wakeups_migrate test broke since we now switch cpus prior to calling ttwu_stat(), hence the test is always true. Cure this by passing the migration state in wake_flags. Also move the whole test under CONFIG_SMP, its hard to migrate tasks on UP :-) Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-pwwxl7gdqs5676f1d4cx6pj7@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8da84b7bc1b8..483c1ed5bc4d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1063,6 +1063,7 @@ struct sched_domain; */ #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x04 /* internal use, task got migrated */ #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 -- cgit v1.2.3