From 5b206d48e58204e84d249c4eb18651a1ff7a1274 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jul 2013 19:05:14 +0200 Subject: vtime: Update a few comments Update a stale comment from the old vtime era and document some locking that might be non obvious. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..223a35efa0a6 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -712,6 +712,13 @@ void vtime_user_enter(struct task_struct *tsk) void vtime_guest_enter(struct task_struct *tsk) { + /* + * The flags must be updated under the lock with + * the vtime_snap flush and update. + * That enforces a right ordering and update sequence + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. + */ write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); current->flags |= PF_VCPU; -- cgit v1.2.3 From 48d6a816a8bf36e2a197c322697323003bdc1cfe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Jul 2013 02:44:35 +0200 Subject: context_tracking: Optimize guest APIs off case with static key Optimize guest entry/exit APIs with static keys. This minimize the overhead for those who enable CONFIG_NO_HZ_FULL without always using it. Having no range passed to nohz_full= should result in the probes overhead to be minimized. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 223a35efa0a6..bb6b29a3067c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -724,6 +724,7 @@ void vtime_guest_enter(struct task_struct *tsk) current->flags |= PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { @@ -732,6 +733,7 @@ void vtime_guest_exit(struct task_struct *tsk) current->flags &= ~PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { -- cgit v1.2.3 From 7621d1f8bcb418e7a7ac583e89e38ec01b7ed182 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Jul 2013 17:07:35 +0200 Subject: vtime: Remove a few unneeded generic vtime state checks Some generic vtime APIs check if the vtime accounting is enabled on the local CPU before doing their work. Some of these are not needed because all their callers already take care of that. Let's remove the checks on these. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bb6b29a3067c..5f273b477764 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -664,9 +664,6 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); @@ -686,12 +683,7 @@ void vtime_account_irq_exit(struct task_struct *tsk) void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu; - - if (!vtime_accounting_enabled()) - return; - - delta_cpu = get_vtime_delta(tsk); + cputime_t delta_cpu = get_vtime_delta(tsk); write_seqlock(&tsk->vtime_seqlock); tsk->vtime_snap_whence = VTIME_SYS; @@ -701,9 +693,6 @@ void vtime_account_user(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); tsk->vtime_snap_whence = VTIME_USER; __vtime_account_system(tsk); -- cgit v1.2.3 From 54461562c90e0ac104764c5a9de637fd9151a1c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Jul 2013 17:10:18 +0200 Subject: vtime: Fix racy cputime delta update get_vtime_delta() must be called under the task vtime_seqlock with the code that does the cputime accounting flush. Otherwise the cputime reader can be fooled and run into a race where it sees the snapshot update but misses the cputime flush. As a result it can report a cputime that is way too short. Fix vtime_account_user() that wasn't complying to that rule. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5f273b477764..b62d5c027c7e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,9 +683,10 @@ void vtime_account_irq_exit(struct task_struct *tsk) void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(tsk); + cputime_t delta_cpu; write_seqlock(&tsk->vtime_seqlock); + delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); write_sequnlock(&tsk->vtime_seqlock); -- cgit v1.2.3 From b04934061330a4a449cfce703c97d887c3e11cd7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jul 2013 03:10:15 +0200 Subject: vtime: Optimize full dynticks accounting off case with static keys If no CPU is in the full dynticks range, we can avoid the full dynticks cputime accounting through generic vtime along with its overhead and use the traditional tick based accounting instead. Let's do this and nope the off case with static keys. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b62d5c027c7e..0831b06aab97 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ #ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) +void vtime_common_task_switch(struct task_struct *prev) { - if (!vtime_accounting_enabled()) - return; - if (is_idle_task(prev)) vtime_account_idle(prev); else @@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_common_account_irq_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - if (!in_interrupt()) { /* * If we interrupted user, context_tracking_in_user() @@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) } vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -669,11 +663,8 @@ void vtime_account_system(struct task_struct *tsk) write_sequnlock(&tsk->vtime_seqlock); } -void vtime_account_irq_exit(struct task_struct *tsk) +void vtime_gen_account_irq_exit(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; @@ -732,11 +723,6 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(delta_cpu); } -bool vtime_accounting_enabled(void) -{ - return context_tracking_active(); -} - void arch_vtime_task_switch(struct task_struct *prev) { write_seqlock(&prev->vtime_seqlock); -- cgit v1.2.3 From b854fafa4e06c50a92e00b39d75ee62083d986d6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Jul 2013 17:24:20 +0200 Subject: vtime: Always scale generic vtime accounting results The cputime accounting in full dynticks can be a subtle mixup of CPUs using tick based accounting and others using generic vtime. As long as the tick can have a share on producing these stats, we want to scale the result against CFS precise accounting as the tick can miss some task hiding between the periodic interrupt. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0831b06aab97..e9e742ed7280 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -553,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr, { cputime_t rtime, stime, utime, total; - if (vtime_accounting_enabled()) { - *ut = curr->utime; - *st = curr->stime; - return; - } - stime = curr->stime; total = stime + curr->utime; -- cgit v1.2.3 From af2350bd12096dfd04e1090b90bfecea1f75f84e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Jul 2013 16:35:55 +0200 Subject: vtime: Always debug check snapshot source _before_ updating it The vtime delta update performed by get_vtime_delta() always check that the source of the snapshot is valid. Meanhile the snapshot updaters that rely on get_vtime_delta() also set the new snapshot origin. But some of them do this right before the call to get_vtime_delta(), making its debug check useless. This is easily fixable by moving the snapshot origin update after the call to get_vtime_delta(). The order doesn't matter there. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e9e742ed7280..c1d7493825ae 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -660,9 +660,9 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); } @@ -680,8 +680,8 @@ void vtime_account_user(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk) { write_seqlock(&tsk->vtime_seqlock); - tsk->vtime_snap_whence = VTIME_USER; __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_sequnlock(&tsk->vtime_seqlock); } -- cgit v1.2.3 From a4f61cc03e443647211a5ae0ab8f8cda2e9e1043 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 7 Aug 2013 15:38:24 +0000 Subject: sched/cputime: Use this_cpu_add() in task_group_account_field() Use of a this_cpu() operation reduces the number of instructions used for accounting (account_user_time()) and frees up some registers. This is in the scheduler tick hotpath. Signed-off-by: Christoph Lameter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/00000140596dd165-338ff7f5-893b-4fec-b251-aaac5557239e-000000@email.amazonses.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..e89ccefef278 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, * is the only cgroup, then nothing else should be necessary. * */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + __this_cpu_add(kernel_cpustat.cpustat[index], tmp); cpuacct_account_field(p, index, tmp); } -- cgit v1.2.3 From 5a8e01f8fa51f5cbce8f37acc050eb2319d12956 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 4 Sep 2013 15:16:03 +0200 Subject: sched/cputime: Do not scale when utime == 0 scale_stime() silently assumes that stime < rtime, otherwise when stime == rtime and both values are big enough (operations on them do not fit in 32 bits), the resulting scaling stime can be bigger than rtime. In consequence utime = rtime - stime results in negative value. User space visible symptoms of the bug are overflowed TIME values on ps/top, for example: $ ps aux | grep rcu root 8 0.0 0.0 0 0 ? S 12:42 0:00 [rcuc/0] root 9 0.0 0.0 0 0 ? S 12:42 0:00 [rcub/0] root 10 62422329 0.0 0 0 ? R 12:42 21114581:37 [rcu_preempt] root 11 0.1 0.0 0 0 ? S 12:42 0:02 [rcuop/0] root 12 62422329 0.0 0 0 ? S 12:42 21114581:35 [rcuop/1] root 10 62422329 0.0 0 0 ? R 12:42 21114581:37 [rcu_preempt] or overflowed utime values read directly from /proc/$PID/stat Reference: https://lkml.org/lkml/2013/8/20/259 Reported-and-tested-by: Sergey Senozhatsky Signed-off-by: Stanislaw Gruszka Cc: stable@vger.kernel.org Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20130904131602.GC2564@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel/sched/cputime.c') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index c1d7493825ae..5b03f5bebabc 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -551,10 +551,7 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, stime, utime, total; - - stime = curr->stime; - total = stime + curr->utime; + cputime_t rtime, stime, utime; /* * Tick based cputime accounting depend on random scheduling @@ -576,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr, if (prev->stime + prev->utime >= rtime) goto out; - if (total) { + stime = curr->stime; + utime = curr->utime; + + if (utime == 0) { + stime = rtime; + } else if (stime == 0) { + utime = rtime; + } else { + cputime_t total = stime + utime; + stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)total); utime = rtime - stime; - } else { - stime = rtime; - utime = 0; } /* -- cgit v1.2.3