From 42c4ab41a176ee784c0f28c0b29025a8fc34f05a Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:26 +0200 Subject: itimers: Merge ITIMER_VIRT and ITIMER_PROF Both cpu itimers have same data flow in the few places, this patch make unification of code related with VIRT and PROF itimers. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-2-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..3b3efaddd953 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -470,6 +470,11 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +struct cpu_itimer { + cputime_t expires; + cputime_t incr; +}; + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units @@ -564,9 +569,12 @@ struct signal_struct { struct pid *leader_pid; ktime_t it_real_incr; - /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ - cputime_t it_prof_expires, it_virt_expires; - cputime_t it_prof_incr, it_virt_incr; + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. -- cgit v1.2.3 From 8356b5f9c424e5831715abbce747197c30d1fd71 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:27 +0200 Subject: itimers: Fix periodic tics precision Measure ITIMER_PROF and ITIMER_VIRT timers interval error between real ticks and requested by user. Take it into account when scheduling next tick. This patch introduce possibility where time between two consecutive tics is smaller then requested interval, it preserve however dependency that n tick is generated not earlier than n*interval time - counting from the beginning of periodic signal generation. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-3-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b3efaddd953..a069e65e8bb7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -473,6 +473,8 @@ struct pacct_struct { struct cpu_itimer { cputime_t expires; cputime_t incr; + u32 error; + u32 incr_error; }; /** -- cgit v1.2.3 From 5f3edc1b1ead6d9bd45a85c551f44eff8fe76b9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:42:00 +0200 Subject: sched: Hook sched_balance_self() into sched_class::select_task_rq() Rather ugly patch to fully place the sched_balance_self() code inside the fair class. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index f3d74bd04d18..5d3c9900943e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -811,6 +811,7 @@ enum cpu_idle_type { #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +#define SD_BALANCE_WAKE 0x2000 /* Balance on wakeup */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -1032,7 +1033,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sync); + int (*select_task_rq)(struct task_struct *p, int flag, int sync); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit v1.2.3 From e9c8431185d6c406887190519f6dbdd112641686 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 14:43:03 +0200 Subject: sched: Add TASK_WAKING We're going to want to drop rq->lock in try_to_wake_up() for a longer period of time, however we also want to deal with concurrent waking of the same task, which is currently handled by holding rq->lock. So introduce a new TASK state, namely TASK_WAKING, which indicates someone is already waking the task (other wakers will fail p->state & state). We also keep preemption disabled over the whole ttwu(). Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5d3c9900943e..3b0ca66bd6ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -190,6 +190,7 @@ extern unsigned long long time_sync_thresh; /* in tsk->state again */ #define TASK_DEAD 64 #define TASK_WAKEKILL 128 +#define TASK_WAKING 256 /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) -- cgit v1.2.3 From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:50:02 +0200 Subject: sched: Merge select_task_rq_fair() and sched_balance_self() The problem with wake_idle() is that is doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor the recent RT interaction. To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self(). Modify sched_balance_self() to: - update_shares() when walking up the domain tree, (it only called it for the top domain, but it should have done this anyway), which allows us to remove this ugly bit from try_to_wake_up(). - do wake_affine() on the smallest domain that contains both this (the waking) and the prev (the wakee) cpu for WAKE invocations. Then use the top-down balance steps it had to replace wake_idle(). This leads to the dissapearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE. SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective. Touch all topology bits to replace the old with new SD flags -- platforms might need re-tuning, enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature, magny-core and small nehalem systems would want this enabled, systems with slow interconnects would not. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b0ca66bd6ce..c30bf3d516d1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -803,16 +803,15 @@ enum cpu_idle_type { #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ +#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ + #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ + #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_BALANCE_WAKE 0x2000 /* Balance on wakeup */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ -- cgit v1.2.3 From 47fe38fcff0517e67d395c039d2e26d2de688a60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 13:49:18 +0200 Subject: x86: sched: Provide arch implementations using aperf/mperf APERF/MPERF support for cpu_power. APERF/MPERF is arch defined to be a relative scale of work capacity per logical cpu, this is assumed to include SMT and Turbo mode. APERF/MPERF are specified to both reset to 0 when either counter wraps, which is highly inconvenient, since that'll give a blimp when that happens. The manual specifies writing 0 to the counters after each read, but that's 1) too expensive, and 2) destroys the possibility of sharing these counters with other users, so we live with the blimp - the other existing user does too. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index c30bf3d516d1..fc4c0f9393d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -992,6 +992,9 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag) return 0; } +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -1003,6 +1006,7 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new, } #endif /* !CONFIG_SMP */ + struct io_context; /* See blkdev.h */ -- cgit v1.2.3 From 0763a660a84220cc3900fd32abdd7ad109e2278d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:37:39 +0200 Subject: sched: Rename select_task_rq() argument In order to be able to rename the sync argument, we need to rename the current flag argument. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index fc4c0f9393d2..5c116f03d74c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1037,7 +1037,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int flag, int sync); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int sync); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit v1.2.3 From 7d47872146398dbede13223299fe1cb368ebc781 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:55:44 +0200 Subject: sched: Rename sync arguments In order to extend the functions to have more than 1 flag (sync), rename the argument to flags, and explicitly define a WF_ space for individual flags. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c116f03d74c..3b07168b6f03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1024,6 +1024,11 @@ struct uts_namespace; struct rq; struct sched_domain; +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakup */ + struct sched_class { const struct sched_class *next; @@ -1031,13 +1036,13 @@ struct sched_class { void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int sync); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit v1.2.3 From a7558e01056f5191ff2ecff53b075dcb9e484188 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 20:02:34 +0200 Subject: sched: Add WF_FORK Avoid the cache buddies from biasing the time distribution away from fork()ers. Normally the next buddy will be the preferred scheduling target, but this makes fork()s prefer to run the new child, whereas we prefer to run the parent, since that will generate more work. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b07168b6f03..ee1f88993097 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1028,6 +1028,7 @@ struct sched_domain; * wake flags */ #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ +#define WF_FORK 0x02 /* child wakeup after fork */ struct sched_class { const struct sched_class *next; -- cgit v1.2.3 From 59abf02644c45f1591e1374ee7bb45dc757fcb88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 08:28:30 +0200 Subject: sched: Add SD_PREFER_LOCAL And turn it on for NUMA and MC domains. This improves locality in balancing decisions by keeping up to capacity amount of tasks local before looking for idle CPUs. (and twice the capacity if SD_POWERSAVINGS_BALANCE is set.) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ee1f88993097..b4a39bb2b4a4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -805,7 +805,7 @@ enum cpu_idle_type { #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ - +#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ -- cgit v1.2.3 From 4db96cf077aa938b11fe7ac79ecc9b29ec00fbab Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Sep 2009 11:50:14 +0200 Subject: HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process This allows processes to override their early/late kill behaviour on hardware memory errors. Typically applications which are memory error aware is better of with early kill (see the error as soon as possible), all others with late kill (only see the error when the error is really impacting execution) There's a global sysctl, but this way an application can set its specific policy. We're using two bits, one to signify that the process stated its intention and that I also made the prctl future proof by enforcing the unused arguments are 0. The state is inherited to children. Note this makes us officially run out of process flags on 32bit, but the next patch can easily add another field. Manpage patch will be supplied separately. Signed-off-by: Andi Kleen --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index f3d74bd04d18..29eae73c951d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1687,6 +1687,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ @@ -1706,6 +1707,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ +#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ -- cgit v1.2.3 From ad4b78bbcbab66998b05d422ac6106b645796e54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 12:31:31 +0200 Subject: sched: Add new wakeup preemption mode: WAKEUP_RUNNING Create a new wakeup preemption mode, preempt towards tasks that run shorter on avg. It sets next buddy to be sure we actually run the task we preempted for. Test results: root@twins:~# while :; do :; done & [1] 6537 root@twins:~# while :; do :; done & [2] 6538 root@twins:~# while :; do :; done & [3] 6539 root@twins:~# while :; do :; done & [4] 6540 root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 4750 usec Avg 497 usec Stdev 737 usec root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 14 usec Avg 5 usec Stdev 3 usec Disabled by default - needs more testing. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4a39bb2b4a4..8af3d249170e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1113,6 +1113,8 @@ struct sched_entity { u64 start_runtime; u64 avg_wakeup; + u64 avg_running; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; -- cgit v1.2.3 From c3422bea5f09b0e85704f51f2b01271630b8940b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 13 Sep 2009 09:15:10 -0700 Subject: rcu: Simplify rcu_read_unlock_special() quiescent-state accounting The earlier approach required two scheduling-clock ticks to note an preemptable-RCU quiescent state in the situation in which the scheduling-clock interrupt is unlucky enough to always interrupt an RCU read-side critical section. With this change, the quiescent state is instead noted by the outermost rcu_read_unlock() immediately following the first scheduling-clock tick, or, alternatively, by the first subsequent context switch. Therefore, this change also speeds up grace periods. Suggested-by: Josh Triplett Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu LKML-Reference: <12528585111945-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index f3d74bd04d18..c62a9f84d614 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1740,7 +1740,6 @@ extern cputime_t task_gtime(struct task_struct *p); #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ -#define RCU_READ_UNLOCK_GOT_QS (1 << 2) /* CPU has responded to RCU core. */ static inline void rcu_copy_process(struct task_struct *p) { -- cgit v1.2.3 From 89f19f04dc72363d912fd007a399cb10310eff6e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 19 Sep 2009 11:55:44 -0700 Subject: sched: Fix raciness in runqueue_is_locked() runqueue_is_locked() is unavoidably racy due to a poor interface design. It does cpu = get_cpu() ret = some_perpcu_thing(cpu); put_cpu(cpu); return ret; Its return value is unreliable. Fix. Signed-off-by: Andrew Morton Acked-by: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <200909191855.n8JItiko022148@imap1.linux-foundation.org> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8af3d249170e..cc37a3fa5065 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,7 +257,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); -extern int runqueue_is_locked(void); +extern int runqueue_is_locked(int cpu); extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_var_t nohz_cpu_mask; -- cgit v1.2.3 From 0d721ceadbeaa24d7f9dd41b3e5e29912327a7e1 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 21 Sep 2009 01:31:53 +0000 Subject: sched: Simplify sys_sched_rr_get_interval() system call By removing the need for it to know details of scheduling classes. This allows PlugSched to define orthogonal scheduling classes. Signed-off-by: Peter Williams Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <06d1b89ee15a0eef82d7.1253496713@mudlark.pw.nest> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index cc37a3fa5065..239c8e0dba9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1075,6 +1075,8 @@ struct sched_class { void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio, int running); + unsigned int (*get_rr_interval) (struct task_struct *task); + #ifdef CONFIG_FAIR_GROUP_SCHED void (*moved_group) (struct task_struct *p); #endif -- cgit v1.2.3 From cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:02:48 +0200 Subject: perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Reviewed-by: Arjan van de Ven Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8af3d249170e..8b265a8986d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,7 +100,7 @@ struct robust_list_head; struct bio; struct fs_struct; struct bts_context; -struct perf_counter_context; +struct perf_event_context; /* * List of flags we want to share for kernel threads, @@ -701,7 +701,7 @@ struct user_struct { #endif #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif }; @@ -1449,10 +1449,10 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif -#ifdef CONFIG_PERF_COUNTERS - struct perf_counter_context *perf_counter_ctxp; - struct mutex perf_counter_mutex; - struct list_head perf_counter_list; +#ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp; + struct mutex perf_event_mutex; + struct list_head perf_event_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ -- cgit v1.2.3 From f8af4da3b4c14e7267c4ffb952079af3912c51c5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Sep 2009 17:01:57 -0700 Subject: ksm: the mm interface to ksm This patch presents the mm interface to a dummy version of ksm.c, for better scrutiny of that interface: the real ksm.c follows later. When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and MADV_UNMERGEABLE with EINVAL, since that seems more helpful than pretending that they can be serviced. But when CONFIG_KSM=y, accept them even if KSM is not currently running, and even on areas which KSM will not touch (e.g. hugetlb or shared file or special driver mappings). Like other madvices, report ENOMEM despite success if any area in the range is unmapped, and use EAGAIN to report out of memory. Define vma flag VM_MERGEABLE to identify an area on which KSM may try merging pages: leave it to ksm_madvise() to decide whether to set it. Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain VM_MERGEABLE areas, to minimize callouts when forking or exiting. Based upon earlier patches by Chris Wright and Izik Eidus. Signed-off-by: Hugh Dickins Signed-off-by: Chris Wright Signed-off-by: Izik Eidus Cc: Michael Kerrisk Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Wu Fengguang Cc: Balbir Singh Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Avi Kivity Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fe351c3914a..8f3e63cb33a6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,7 +434,9 @@ extern int get_dumpable(struct mm_struct *mm); /* dumpable bits */ #define MMF_DUMPABLE 0 /* core dump is permitted */ #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ + #define MMF_DUMPABLE_BITS 2 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 @@ -444,6 +446,7 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_DUMP_ELF_HEADERS 6 #define MMF_DUMP_HUGETLB_PRIVATE 7 #define MMF_DUMP_HUGETLB_SHARED 8 + #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 7 #define MMF_DUMP_FILTER_MASK \ @@ -457,6 +460,10 @@ extern int get_dumpable(struct mm_struct *mm); #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif + /* leave room for more dump flags */ +#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) struct sighand_struct { atomic_t count; -- cgit v1.2.3 From 35451beecbd7c86ce3249d543594517a5fe9a0cd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Sep 2009 17:02:27 -0700 Subject: ksm: unmerge is an origin of OOMs Just as the swapoff system call allocates many pages of RAM to various processes, perhaps triggering OOM, so "echo 2 >/sys/kernel/mm/ksm/run" (unmerge) is liable to allocate many pages of RAM to various processes, perhaps triggering OOM; and each is normally run from a modest admin process (swapoff or shell), easily repeated until it succeeds. So treat unmerge_and_remove_all_rmap_items() in the same way that we treat try_to_unuse(): generalize PF_SWAPOFF to PF_OOM_ORIGIN, and bracket both with that, to ask the OOM killer to kill them first, to prevent them from spawning more and more OOM kills. Signed-off-by: Hugh Dickins Acked-by: Izik Eidus Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8f3e63cb33a6..899d7304d594 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1720,7 +1720,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ #define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ +#define PF_OOM_ORIGIN 0x00080000 /* Allocating much memory to others */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ -- cgit v1.2.3 From 28b83c5193e7ab951e402252278f2cc79dc4d298 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:03:13 -0700 Subject: oom: move oom_adj value from task_struct to signal_struct Currently, OOM logic callflow is here. __out_of_memory() select_bad_process() for each task badness() calculate badness of one task oom_kill_process() search child oom_kill_task() kill target task and mm shared tasks with it example, process-A have two thread, thread-A and thread-B and it have very fat memory and each thread have following oom_adj and oom_score. thread-A: oom_adj = OOM_DISABLE, oom_score = 0 thread-B: oom_adj = 0, oom_score = very-high Then, select_bad_process() select thread-B, but oom_kill_task() refuse kill the task because thread-A have OOM_DISABLE. Thus __out_of_memory() call select_bad_process() again. but select_bad_process() select the same task. It mean kernel fall in livelock. The fact is, select_bad_process() must select killable task. otherwise OOM logic go into livelock. And root cause is, oom_adj shouldn't be per-thread value. it should be per-process value because OOM-killer kill a process, not thread. Thus This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct signal_struct. it naturally prevent select_bad_process() choose wrong task. Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 899d7304d594..17e9a8e9a51d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -639,6 +639,8 @@ struct signal_struct { unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif + + int oom_adj; /* OOM kill score adjustment (bit shift) */ }; /* Context switch must be unlocked if interrupts are to be enabled */ @@ -1221,7 +1223,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif -- cgit v1.2.3 From 69d25870f20c4b2563304f2b79c5300dd60a067e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 21 Sep 2009 17:04:08 -0700 Subject: cpuidle: fix the menu governor to boost IO performance Fix the menu idle governor which balances power savings, energy efficiency and performance impact. The reason for a reworked governor is that there have been serious performance issues reported with the existing code on Nehalem server systems. To show this I'm sure Andrew wants to see benchmark results: (benchmark is "fio", "no cstates" is using "idle=poll") no cstates current linux new algorithm 1 disk 107 Mb/s 85 Mb/s 105 Mb/s 2 disks 215 Mb/s 123 Mb/s 209 Mb/s 12 disks 590 Mb/s 320 Mb/s 585 Mb/s In various power benchmark measurements, no degredation was found by our measurement&diagnostics team. Obviously a small percentage more power was used in the "fio" benchmark, due to the much higher performance. While it would be a novel idea to describe the new algorithm in this commit message, I cheaped out and described it in comments in the code instead. [changes since first post: spelling fixes from akpm, review feedback, folded menu-tng into menu.c] Signed-off-by: Arjan van de Ven Cc: Venkatesh Pallipadi Cc: Len Brown Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Yanmin Zhang Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 17e9a8e9a51d..97b10da0a3ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -140,6 +140,10 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +extern unsigned long nr_iowait_cpu(void); +extern unsigned long this_cpu_load(void); + + extern void calc_global_load(void); extern u64 cpu_nr_migrations(int cpu); -- cgit v1.2.3 From 1f10206cf8e945220f7220a809d8bfc15c21f9a5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 22 Sep 2009 16:44:10 -0700 Subject: getrusage: fill ru_maxrss value Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater mark. This struct is filled as a parameter to getrusage syscall. ->ru_maxrss value is set to KBs which is the way it is done in BSD systems. /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs which seems to be incorrect behavior. Maintainer of this util was notified by me with the patch which corrects it and cc'ed. To make this happen we extend struct signal_struct by two fields. The first one is ->maxrss which we use to store rss hiwater of the task. The second one is ->cmaxrss which we use to store highest rss hiwater of all task childs. These values are used in k_getrusage() to actually fill ->ru_maxrss. k_getrusage() uses current rss hiwater value directly if mm struct exists. Note: exec() clear mm->hiwater_rss, but doesn't clear sig->maxrss. it is intetionally behavior. *BSD getrusage have exec() inheriting. test programs ======================================================== getrusage.c =========== #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #define err(str) perror(str), exit(1) int main(int argc, char** argv) { int status; printf("allocate 100MB\n"); consume(100); printf("testcase1: fork inherit? \n"); printf(" expect: initial.self ~= child.self\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { show_rusage("fork child"); _exit(0); } printf("\n"); printf("testcase2: fork inherit? (cont.) \n"); printf(" expect: initial.children ~= 100MB, but child.children = 0\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { show_rusage("child"); _exit(0); } printf("\n"); printf("testcase3: fork + malloc \n"); printf(" expect: child.self ~= initial.self + 50MB\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { printf("allocate +50MB\n"); consume(50); show_rusage("fork child"); _exit(0); } printf("\n"); printf("testcase4: grandchild maxrss\n"); printf(" expect: post_wait.children ~= 300MB\n"); show_rusage("initial"); if (__fork()) { wait(&status); show_rusage("post_wait"); } else { system("./child -n 0 -g 300"); _exit(0); } printf("\n"); printf("testcase5: zombie\n"); printf(" expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n"); printf(" post_wait ~= 400MB, IOW wait() collect child's max_rss. \n"); show_rusage("initial"); if (__fork()) { sleep(1); /* children become zombie */ show_rusage("pre_wait"); wait(&status); show_rusage("post_wait"); } else { system("./child -n 400"); _exit(0); } printf("\n"); printf("testcase6: SIG_IGN\n"); printf(" expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n"); show_rusage("initial"); signal(SIGCHLD, SIG_IGN); if (__fork()) { sleep(1); /* children become zombie */ show_rusage("after_zombie"); } else { system("./child -n 500"); _exit(0); } printf("\n"); signal(SIGCHLD, SIG_DFL); printf("testcase7: exec (without fork) \n"); printf(" expect: initial ~= exec \n"); show_rusage("initial"); execl("./child", "child", "-v", NULL); return 0; } child.c ======= #include #include #include #include #include #include #include #include #include #include #include "common.h" int main(int argc, char** argv) { int status; int c; long consume_size = 0; long grandchild_consume_size = 0; int show = 0; while ((c = getopt(argc, argv, "n:g:v")) != -1) { switch (c) { case 'n': consume_size = atol(optarg); break; case 'v': show = 1; break; case 'g': grandchild_consume_size = atol(optarg); break; default: break; } } if (show) show_rusage("exec"); if (consume_size) { printf("child alloc %ldMB\n", consume_size); consume(consume_size); } if (grandchild_consume_size) { if (fork()) { wait(&status); } else { printf("grandchild alloc %ldMB\n", grandchild_consume_size); consume(grandchild_consume_size); exit(0); } } return 0; } common.c ======== #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #define err(str) perror(str), exit(1) void show_rusage(char *prefix) { int err, err2; struct rusage rusage_self; struct rusage rusage_children; printf("%s: ", prefix); err = getrusage(RUSAGE_SELF, &rusage_self); if (!err) printf("self %ld ", rusage_self.ru_maxrss); err2 = getrusage(RUSAGE_CHILDREN, &rusage_children); if (!err2) printf("children %ld ", rusage_children.ru_maxrss); printf("\n"); } /* Some buggy OS need this worthless CPU waste. */ void make_pagefault(void) { void *addr; int size = getpagesize(); int i; for (i=0; i<1000; i++) { addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if (addr == MAP_FAILED) err("make_pagefault"); memset(addr, 0, size); munmap(addr, size); } } void consume(int mega) { size_t sz = mega * 1024 * 1024; void *ptr; ptr = malloc(sz); memset(ptr, 0, sz); make_pagefault(); } pid_t __fork(void) { pid_t pid; pid = fork(); make_pagefault(); return pid; } common.h ======== void show_rusage(char *prefix); void make_pagefault(void); void consume(int mega); pid_t __fork(void); FreeBSD result (expected result) ======================================================== allocate 100MB testcase1: fork inherit? expect: initial.self ~= child.self initial: self 103492 children 0 fork child: self 103540 children 0 testcase2: fork inherit? (cont.) expect: initial.children ~= 100MB, but child.children = 0 initial: self 103540 children 103540 child: self 103564 children 0 testcase3: fork + malloc expect: child.self ~= initial.self + 50MB initial: self 103564 children 103564 allocate +50MB fork child: self 154860 children 0 testcase4: grandchild maxrss expect: post_wait.children ~= 300MB initial: self 103564 children 154860 grandchild alloc 300MB post_wait: self 103564 children 308720 testcase5: zombie expect: pre_wait ~= initial, IOW the zombie process is not accounted. post_wait ~= 400MB, IOW wait() collect child's max_rss. initial: self 103564 children 308720 child alloc 400MB pre_wait: self 103564 children 308720 post_wait: self 103564 children 411312 testcase6: SIG_IGN expect: initial ~= after_zombie (child's 500MB alloc should be ignored). initial: self 103564 children 411312 child alloc 500MB after_zombie: self 103624 children 411312 testcase7: exec (without fork) expect: initial ~= exec initial: self 103624 children 411312 exec: self 103624 children 411312 Linux result (actual test result) ======================================================== allocate 100MB testcase1: fork inherit? expect: initial.self ~= child.self initial: self 102848 children 0 fork child: self 102572 children 0 testcase2: fork inherit? (cont.) expect: initial.children ~= 100MB, but child.children = 0 initial: self 102876 children 102644 child: self 102572 children 0 testcase3: fork + malloc expect: child.self ~= initial.self + 50MB initial: self 102876 children 102644 allocate +50MB fork child: self 153804 children 0 testcase4: grandchild maxrss expect: post_wait.children ~= 300MB initial: self 102876 children 153864 grandchild alloc 300MB post_wait: self 102876 children 307536 testcase5: zombie expect: pre_wait ~= initial, IOW the zombie process is not accounted. post_wait ~= 400MB, IOW wait() collect child's max_rss. initial: self 102876 children 307536 child alloc 400MB pre_wait: self 102876 children 307536 post_wait: self 102876 children 410076 testcase6: SIG_IGN expect: initial ~= after_zombie (child's 500MB alloc should be ignored). initial: self 102876 children 410076 child alloc 500MB after_zombie: self 102880 children 410076 testcase7: exec (without fork) expect: initial ~= exec initial: self 102880 children 410076 exec: self 102880 children 410076 Signed-off-by: Jiri Pirko Signed-off-by: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 97b10da0a3ea..6448bbc6406b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -426,6 +426,15 @@ static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) return max(mm->hiwater_rss, get_mm_rss(mm)); } +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, + struct mm_struct *mm) +{ + unsigned long hiwater_rss = get_mm_hiwater_rss(mm); + + if (*maxrss < hiwater_rss) + *maxrss = hiwater_rss; +} + static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); @@ -612,6 +621,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; + unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* -- cgit v1.2.3 From d899bf7b55f503ba7d3d07ed27c3a37e270fa7db Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 22 Sep 2009 16:45:40 -0700 Subject: procfs: provide stack information for threads A patch to give a better overview of the userland application stack usage, especially for embedded linux. Currently you are only able to dump the main process/thread stack usage which is showed in /proc/pid/status by the "VmStk" Value. But you get no information about the consumed stack memory of the the threads. There is an enhancement in the /proc//{task/*,}/*maps and which marks the vm mapping where the thread stack pointer reside with "[thread stack xxxxxxxx]". xxxxxxxx is the maximum size of stack. This is a value information, because libpthread doesn't set the start of the stack to the top of the mapped area, depending of the pthread usage. A sample output of /proc//task//maps looks like: 08048000-08049000 r-xp 00000000 03:00 8312 /opt/z 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/z 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] a7d12000-a7d13000 ---p 00000000 00:00 0 a7d13000-a7f13000 rw-p 00000000 00:00 0 [thread stack: 001ff4b4] a7f13000-a7f14000 ---p 00000000 00:00 0 a7f14000-a7f36000 rw-p 00000000 00:00 0 a7f36000-a8069000 r-xp 00000000 03:00 4222 /lib/libc.so.6 a8069000-a806b000 r--p 00133000 03:00 4222 /lib/libc.so.6 a806b000-a806c000 rw-p 00135000 03:00 4222 /lib/libc.so.6 a806c000-a806f000 rw-p 00000000 00:00 0 a806f000-a8083000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0 a8083000-a8084000 r--p 00013000 03:00 14462 /lib/libpthread.so.0 a8084000-a8085000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0 a8085000-a8088000 rw-p 00000000 00:00 0 a8088000-a80a4000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2 a80a4000-a80a5000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2 a80a5000-a80a6000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2 afaf5000-afb0a000 rw-p 00000000 00:00 0 [stack] ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso] Also there is a new entry "stack usage" in /proc//{task/*,}/status which will you give the current stack usage in kb. A sample output of /proc/self/status looks like: Name: cat State: R (running) Tgid: 507 Pid: 507 . . . CapBnd: fffffffffffffeff voluntary_ctxt_switches: 0 nonvoluntary_ctxt_switches: 0 Stack usage: 12 kB I also fixed stack base address in /proc//{task/*,}/stat to the base address of the associated thread stack and not the one of the main process. This makes more sense. [akpm@linux-foundation.org: fs/proc/array.c now needs walk_page_range()] Signed-off-by: Stefani Seibold Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6448bbc6406b..3cbc6c0be666 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1529,6 +1529,7 @@ struct task_struct { /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ + unsigned long stack_start; }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ -- cgit v1.2.3 From e0ad955680878998ff7dc51ce06ddad12260423a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:38 -0600 Subject: cpumask: don't define set_cpus_allowed() if CONFIG_CPUMASK_OFFSTACK=y You're not supposed to pass cpumasks on the stack in that case. Signed-off-by: Rusty Russell --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbf2a3b46280..848d1f20086e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1817,10 +1817,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, return 0; } #endif + +#ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { return set_cpus_allowed_ptr(p, &new_mask); } +#endif /* * Architectures can set this to 1 if they have specified -- cgit v1.2.3 From a7f0765edfd53aed09cb7b0e15863688b39447de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:44 -0700 Subject: ptrace: __ptrace_detach: do __wake_up_parent() if we reap the tracee The bug is old, it wasn't cause by recent changes. Test case: static void *tfunc(void *arg) { int pid = (long)arg; assert(ptrace(PTRACE_ATTACH, pid, NULL, NULL) == 0); kill(pid, SIGKILL); sleep(1); return NULL; } int main(void) { pthread_t th; long pid = fork(); if (!pid) pause(); signal(SIGCHLD, SIG_IGN); assert(pthread_create(&th, NULL, tfunc, (void*)pid) == 0); int r = waitpid(-1, NULL, __WNOTHREAD); printf("waitpid: %d %m\n", r); return 0; } Before the patch this program hangs, after this patch waitpid() correctly fails with errno == -ECHILD. The problem is, __ptrace_detach() reaps the EXIT_ZOMBIE tracee if its ->real_parent is our sub-thread and we ignore SIGCHLD. But in this case we should wake up other threads which can sleep in do_wait(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 848d1f20086e..9e5a88afe6be 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2059,6 +2059,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); extern int do_notify_parent(struct task_struct *, int); +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); -- cgit v1.2.3 From d9588725e52650e82989707f8fd2feb67ad2dc8e Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 23 Sep 2009 15:57:04 -0700 Subject: signals: inline __fatal_signal_pending __fatal_signal_pending inlines to one instruction on x86, probably two instructions on other machines. It takes two longer x86 instructions just to call it and test its return value, not to mention the function itself. On my random x86_64 config, this saved 70 bytes of text (59 of those being __fatal_signal_pending itself). Signed-off-by: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9e5a88afe6be..e951bd2eb9fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2337,7 +2337,10 @@ static inline int signal_pending(struct task_struct *p) return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } -extern int __fatal_signal_pending(struct task_struct *p); +static inline int __fatal_signal_pending(struct task_struct *p) +{ + return unlikely(sigismember(&p->pending.signal, SIGKILL)); +} static inline int fatal_signal_pending(struct task_struct *p) { -- cgit v1.2.3 From 8d65af789f3e2cf4cfbdbf71a0f7a61ebcd41d38 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 23 Sep 2009 15:57:19 -0700 Subject: sysctl: remove "struct file *" argument of ->proc_handler It's unused. It isn't needed -- read or write flag is already passed and sysctl shouldn't care about the rest. It _was_ used in two places at arch/frv for some reason. Signed-off-by: Alexey Dobriyan Cc: David Howells Cc: "Eric W. Biederman" Cc: Al Viro Cc: Ralf Baechle Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: "David S. Miller" Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e951bd2eb9fc..811cd96524d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -309,7 +309,7 @@ extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; @@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -1906,7 +1906,7 @@ extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos); #endif #ifdef CONFIG_SCHED_DEBUG @@ -1924,7 +1924,7 @@ extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int sysctl_sched_compat_yield; -- cgit v1.2.3 From 801460d0cf5c5288153b722565773059b0f44348 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 23 Sep 2009 15:57:41 -0700 Subject: task_struct cleanup: move binfmt field to mm_struct Because the binfmt is not different between threads in the same process, it can be moved from task_struct to mm_struct. And binfmt moudle is handled per mm_struct instead of task_struct. Signed-off-by: Hiroshi Shimamoto Acked-by: Oleg Nesterov Cc: Rusty Russell Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 811cd96524d7..8a16f6d11dcd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1271,7 +1271,6 @@ struct task_struct { struct mm_struct *mm, *active_mm; /* task state */ - struct linux_binfmt *binfmt; int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ -- cgit v1.2.3