From 87b526d349b04c31d7b3a40b434eb3f825d22305 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 1 Oct 2012 11:40:45 -0700 Subject: seccomp: Make syscall skipping and nr changes more consistent This fixes two issues that could cause incompatibility between kernel versions: - If a tracer uses SECCOMP_RET_TRACE to select a syscall number higher than the largest known syscall, emulate the unknown vsyscall by returning -ENOSYS. (This is unlikely to make a noticeable difference on x86-64 due to the way the system call entry works.) - On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy. This updates the documentation accordingly. Signed-off-by: Andy Lutomirski Acked-by: Will Drewry Signed-off-by: James Morris --- kernel/seccomp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf9..5af44b593770 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; + struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, regs, -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, regs); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); goto skip; + } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; + if (syscall_get_nr(current, regs) < 0) + goto skip; /* Explicit request to skip. */ + return 0; case SECCOMP_RET_ALLOW: return 0; -- cgit v1.2.3 From 3a50597de8635cd05133bd12c95681c82fe7b878 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Oct 2012 19:24:29 +0100 Subject: KEYS: Make the session and process keyrings per-thread Make the session keyring per-thread rather than per-process, but still inherited from the parent thread to solve a problem with PAM and gdm. The problem is that join_session_keyring() will reject attempts to change the session keyring of a multithreaded program but gdm is now multithreaded before it gets to the point of starting PAM and running pam_keyinit to create the session keyring. See: https://bugs.freedesktop.org/show_bug.cgi?id=49211 The reason that join_session_keyring() will only change the session keyring under a single-threaded environment is that it's hard to alter the other thread's credentials to effect the change in a multi-threaded program. The problems are such as: (1) How to prevent two threads both running join_session_keyring() from racing. (2) Another thread's credentials may not be modified directly by this process. (3) The number of threads is uncertain whilst we're not holding the appropriate spinlock, making preallocation slightly tricky. 
(4) We could use TIF_NOTIFY_RESUME and key_replace_session_keyring() to get another thread to replace its keyring, but that means preallocating for each thread. A reasonable way around this is to make the session keyring per-thread rather than per-process and just document that if you want a common session keyring, you must get it before you spawn any threads - which is the current situation anyway. Whilst we're at it, we can the process keyring behave in the same way. This means we can clean up some of the ickyness in the creds code. Basically, after this patch, the session, process and thread keyrings are about inheritance rules only and not about sharing changes of keyring. Reported-by: Mantas M. Signed-off-by: David Howells Tested-by: Ray Strode --- kernel/cred.c | 127 +++++++--------------------------------------------------- 1 file changed, 15 insertions(+), 112 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d82..3f7ad1ec2ae4 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -29,17 +29,6 @@ static struct kmem_cache *cred_jar; -/* - * The common credentials for the initial task's thread group - */ -#ifdef CONFIG_KEYS -static struct thread_group_cred init_tgcred = { - .usage = ATOMIC_INIT(2), - .tgid = 0, - .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), -}; -#endif - /* * The initial credentials for the initial task */ @@ -65,9 +54,6 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, -#ifdef CONFIG_KEYS - .tgcred = &init_tgcred, -#endif }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -95,36 +81,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) #endif } -/* - * Dispose of the shared task group credentials - */ -#ifdef CONFIG_KEYS -static void release_tgcred_rcu(struct rcu_head *rcu) -{ - struct thread_group_cred *tgcred = - container_of(rcu, struct thread_group_cred, rcu); - - BUG_ON(atomic_read(&tgcred->usage) != 0); - - key_put(tgcred->session_keyring); - key_put(tgcred->process_keyring); - kfree(tgcred); -} -#endif - -/* - * Release a set of thread group credentials. 
- */ -static void release_tgcred(struct cred *cred) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = cred->tgcred; - - if (atomic_dec_and_test(&tgcred->usage)) - call_rcu(&tgcred->rcu, release_tgcred_rcu); -#endif -} - /* * The RCU callback to actually dispose of a set of credentials */ @@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) #endif security_cred_free(cred); + key_put(cred->session_keyring); + key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); - release_tgcred(cred); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); @@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; -#ifdef CONFIG_KEYS - new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); - if (!new->tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } - atomic_set(&new->tgcred->usage, 1); -#endif - atomic_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; @@ -308,9 +256,10 @@ struct cred *prepare_creds(void) get_user_ns(new->user_ns); #ifdef CONFIG_KEYS + key_get(new->session_keyring); + key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); - atomic_inc(&new->tgcred->usage); #endif #ifdef CONFIG_SECURITY @@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); */ struct cred *prepare_exec_creds(void) { - struct thread_group_cred *tgcred = NULL; struct cred *new; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return NULL; -#endif - new = prepare_creds(); - if (!new) { - kfree(tgcred); + if (!new) return new; - } #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; #endif return new; @@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif struct cred *new; int ret; @@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) install_thread_keyring_to_cred(new); } - /* we share the process and session keyrings between all the threads in - * a process - this is slightly icky as we violate COW credentials a - * bit */ + /* The process keyring is only shared between the threads in a process; + * anything outside of those threads doesn't inherit. 
+ */ if (!(clone_flags & CLONE_THREAD)) { - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - ret = -ENOMEM; - goto error_put; - } - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = key_get(new->tgcred->session_keyring); - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; } #endif @@ -643,9 +560,6 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif const struct cred *old; struct cred *new; @@ -653,14 +567,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } -#endif - kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -678,13 +584,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = NULL; - new->tgcred = tgcred; - new->request_key_auth = NULL; + new->session_keyring = NULL; + new->process_keyring = NULL; new->thread_keyring = NULL; + new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif -- cgit v1.2.3 From 5bb962269c29cbb878414cddf0ebdff8c5cdef0a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Oct 2012 02:03:27 +0200 Subject: tick: Consolidate timekeeping handling code Unify the duplicated timekeeping handling code of low and high res tick sched handlers. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/time/tick-sched.c | 54 +++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..360674c485f5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -98,6 +98,28 @@ static ktime_t tick_init_jiffy_update(void) return period; } + +static void tick_sched_do_timer(ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); +} + /* * NOHZ - aka dynamic tick functionality */ @@ -648,24 +670,11 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. 
- */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); + tick_sched_do_timer(now); /* * When we are idle and the tick is stopped, we have to touch @@ -802,23 +811,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ - /* - * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to - * this duty, then the jiffies update is still serialized by - * xtime_lock. - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) - tick_do_timer_cpu = cpu; -#endif - - /* Check, if the jiffies need an update */ - if (tick_do_timer_cpu == cpu) - tick_do_update_jiffies64(now); + tick_sched_do_timer(now); /* * Do not call, when we are not in irq context and have -- cgit v1.2.3 From 9e8f559b08cbc1cfcbf093840a2a760a946cb90f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Oct 2012 02:43:03 +0200 Subject: tick: Consolidate tick handling for high and low res handlers Besides unifying code, this also adds the idle check before processing idle accounting specifics on the low res handler. This way we also generalize this part of the nohz code for !CONFIG_HIGH_RES_TIMERS to prepare for the adaptive tickless features. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/time/tick-sched.c | 55 +++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 360674c485f5..68a873af09a8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -120,6 +120,25 @@ static void tick_sched_do_timer(ktime_t now) tick_do_update_jiffies64(now); } +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + if (is_idle_task(current)) + ts->idle_jiffies++; + } + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} + /* * NOHZ - aka dynamic tick functionality */ @@ -675,22 +694,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; tick_sched_do_timer(now); - - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start - * of idle" jiffy stamp so the idle accounting adjustment we - * do when we go busy again does not account too much ticks. 
- */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - ts->idle_jiffies++; - } - - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); + tick_sched_handle(ts, regs); while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); } @@ -818,23 +822,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * Do not call, when we are not in irq context and have * no valid regs pointer */ - if (regs) { - /* - * When we are idle and the tick is stopped, we have to touch - * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while - * waiting on the login prompt. We also increment the "start of - * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. - */ - if (ts->tick_stopped) { - touch_softlockup_watchdog(); - if (is_idle_task(current)) - ts->idle_jiffies++; - } - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING); - } + if (regs) + tick_sched_handle(ts, regs); hrtimer_forward(timer, now, tick_period); -- cgit v1.2.3 From 94a571402012e0dfaa23bbbdd64d033f48477d86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Oct 2012 16:17:16 +0200 Subject: tick: Conditionally build nohz specific code in tick handler This optimizes the high res tick sched handler a bit. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/time/tick-sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 68a873af09a8..766d4c47a4a4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -122,6 +122,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { +#ifdef CONFIG_NO_HZ /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on complete idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog(); if (is_idle_task(current)) ts->idle_jiffies++; } +#endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } -- cgit v1.2.3 From 8ed92e51f99c2199c64cb33b4ba95ab12940a94c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Oct 2012 14:28:50 +0200 Subject: sched: Add WAKEUP_PREEMPTION feature flag, on by default As per the recent discussion with Mike and Linus, make it easier to test with/without this feature. No change in default behavior.
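Since the flag exists purely to ease testing, it helps to see how it is flipped at run time. A minimal userspace sketch, not part of the patch itself: it assumes CONFIG_SCHED_DEBUG is enabled and debugfs is mounted at the usual /sys/kernel/debug, where writing "NO_WAKEUP_PREEMPTION" to sched_features clears the flag and writing "WAKEUP_PREEMPTION" sets it again.

/* Sketch: toggle the WAKEUP_PREEMPTION scheduler feature via debugfs. */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *path = "/sys/kernel/debug/sched_features";
	/* "off" as the first argument disables wakeup preemption,
	 * anything else (or no argument) re-enables it. */
	const char *val = (argc > 1 && strcmp(argv[1], "off") == 0)
			  ? "NO_WAKEUP_PREEMPTION" : "WAKEUP_PREEMPTION";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f) ? 1 : 0;
}

Run as root; with the flag cleared, check_preempt_wakeup() below returns early and wakeups no longer preempt the running task.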
Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-izoxq4haeg4mTognnDbwcevt@git.kernel.org --- kernel/sched/fair.c | 2 +- kernel/sched/features.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..f936552b3db1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2907,7 +2907,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL)) + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..e68e69ab917d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -31,6 +31,11 @@ SCHED_FEAT(LAST_BUDDY, true) */ SCHED_FEAT(CACHE_HOT_BUDDY, true) +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + /* * Use arch dependent cpu power functions */ -- cgit v1.2.3 From 5edee61edeaaebafe584f8fb7074c1ef4658596b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup: cgroup_subsys->fork() should be called after the task is added to css_set cgroup core has a bug which violates a basic rule about event notifications - when a new entity needs to be added, you add that to the notification list first and then make the new entity conform to the current state. If done in the reverse order, an event happening inbetween will be lost. cgroup_subsys->fork() is invoked way before the new task is added to the css_set. Currently, cgroup_freezer is the only user of ->fork() and uses it to make new tasks conform to the current state of the freezer. If FROZEN state is requested while fork is in progress between cgroup_fork_callbacks() and cgroup_post_fork(), the child could escape freezing - the cgroup isn't frozen when ->fork() is called and the freezer couldn't see the new task on the css_set. This patch moves cgroup_subsys->fork() invocation to cgroup_post_fork() after the new task is added to the css_set. cgroup_fork_callbacks() is removed. Because now a task may be migrated during cgroup_subsys->fork(), freezer_fork() is updated so that it adheres to the usual RCU locking and the rather pointless comment on why locking can be different there is removed (if it doesn't make anything simpler, why even bother?). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/cgroup.c | 62 ++++++++++++++++++++++++------------------------- kernel/cgroup_freezer.c | 13 ++++------- kernel/fork.c | 9 +------ 3 files changed, 35 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13774b3b39aa..b7a0171067ea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4843,45 +4843,20 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -/** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. 
- */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* - * forkexit callbacks are only supported for - * builtin subsystems. - */ - if (!ss || ss->module) - continue; - - if (ss->fork) - ss->fork(child); - } - } -} - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. + * Adds the task to the list running through its css_set if necessary and + * call the subsystem fork() callbacks. Has to be after the task is + * visible on the task list in case we race with the first call to + * cgroup_iter_start() - to guarantee that the new task ends up on its + * list. */ void cgroup_post_fork(struct task_struct *child) { + int i; + /* * use_task_css_set_links is set to 1 before we walk the tasklist * under the tasklist_lock and we read it here after we added the child @@ -4910,7 +4885,30 @@ void cgroup_post_fork(struct task_struct *child) } write_unlock(&css_set_lock); } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + if (need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + /* + * fork/exit callbacks are supported only for + * builtin subsystems and we don't need further + * synchronization as they never go away. + */ + if (!ss || ss->module) + continue; + + if (ss->fork) + ss->fork(child); + } + } } + /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b1724ce98981..12bfedb598c2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -186,23 +186,15 @@ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - /* - * No lock is needed, since the task isn't on tasklist yet, - * so it can't be moved to another cgroup, which means the - * freezer won't be removed and will be valid during this - * function call. Nevertheless, apply RCU read-side critical - * section to suppress RCU lockdep false positives. - */ rcu_read_lock(); freezer = task_freezer(task); - rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the * following check. */ if (!freezer->css.cgroup->parent) - return; + goto out; spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); @@ -210,7 +202,10 @@ static void freezer_fork(struct task_struct *task) /* Locking avoids race with FREEZING -> THAWED transitions. 
*/ if (freezer->state == CGROUP_FREEZING) freeze_task(task); + spin_unlock_irq(&freezer->lock); +out: + rcu_read_unlock(); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..acc4cb62f32f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, cgroup_callbacks_done); + cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: -- cgit v1.2.3 From 51f246ed95caed12898649d8170d2d352da6af76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: make it official that writes to freezer.state don't fail try_to_freeze_cgroup() has condition checks which are intended to fail the write operation to freezer.state if there are tasks which can't be frozen. The condition checks have been broken for quite some time now. freeze_task() returns %false if the target task can't be frozen, so num_cant_freeze_now is never incremented. In addition, strangely, cgroup freezing proceeds even after the write is failed, which is rather broken. This patch rips out the non-working code intended to fail the write to freezer.state when the cgroup contains non-freezable tasks and makes it official that writes to freezer.state succeed whether there are non-freezable tasks in the cgroup or not. This leaves is_task_frozen_enough() with only one user - update_if_frozen(). Collapse it into the caller. Note that this removes an extra call to freezing(). This doesn't cause any userland behavior changes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J.
Wysocki --- kernel/cgroup_freezer.c | 43 +++++++++++-------------------------------- 1 file changed, 11 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 12bfedb598c2..05d52185139c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -150,13 +150,6 @@ static void freezer_destroy(struct cgroup *cgroup) kfree(freezer); } -/* task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -222,7 +215,8 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (freezing(task) && is_task_frozen_enough(task)) + if (freezing(task) && (frozen(task) || + task_is_stopped_or_traced(task))) nfrozen++; } @@ -264,24 +258,15 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, return 0; } -static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; - unsigned int num_cant_freeze_now = 0; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task)) - continue; - if (is_task_frozen_enough(task)) - continue; - if (!freezing(task) && !freezer_should_skip(task)) - num_cant_freeze_now++; - } + while ((task = cgroup_iter_next(cgroup, &it))) + freeze_task(task); cgroup_iter_end(cgroup, &it); - - return num_cant_freeze_now ? 
-EBUSY : 0; } static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) @@ -295,13 +280,10 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static int freezer_change_state(struct cgroup *cgroup, - enum freezer_state goal_state) +static void freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) { - struct freezer *freezer; - int retval = 0; - - freezer = cgroup_freezer(cgroup); + struct freezer *freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); @@ -318,22 +300,19 @@ static int freezer_change_state(struct cgroup *cgroup, if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; - retval = try_to_freeze_cgroup(cgroup, freezer); + freeze_cgroup(cgroup, freezer); break; default: BUG(); } spin_unlock_irq(&freezer->lock); - - return retval; } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { - int retval; enum freezer_state goal_state; if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) @@ -345,9 +324,9 @@ static int freezer_write(struct cgroup *cgroup, if (!cgroup_lock_live_group(cgroup)) return -ENODEV; - retval = freezer_change_state(cgroup, goal_state); + freezer_change_state(cgroup, goal_state); cgroup_unlock(); - return retval; + return 0; } static struct cftype files[] = { -- cgit v1.2.3 From 3c426d5e114035d00453bb5d82a92826db1ed71f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: don't stall transition to FROZEN for PF_NOFREEZE or PF_FREEZER_SKIP tasks cgroup_freezer doesn't transition from FREEZING to FROZEN if the cgroup contains PF_NOFREEZE tasks or tasks sleeping with PF_FREEZER_SKIP set. Only kernel tasks can be non-freezable (PF_NOFREEZE) and there's nothing cgroup_freezer or userland can do about or to it. It's pointless to stall the transition for PF_NOFREEZE tasks. PF_FREEZER_SKIP indicates that the task can be skipped when determining whether frozen state is reached. A task with PF_FREEZER_SKIP is guaranteed to perform try_to_freeze() after it wakes up and can be considered frozen much like stopped or traced tasks. Note that a vfork parent uses PF_FREEZER_SKIP while waiting for the child. This updates update_if_frozen() such that it only considers freezable tasks and treats %true freezer_should_skip() tasks as frozen. This allows cgroups w/ kthreads and vfork parents successfully reach FROZEN state. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki --- kernel/cgroup_freezer.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 05d52185139c..557f3678c4e4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -214,10 +214,18 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - ntotal++; - if (freezing(task) && (frozen(task) || - task_is_stopped_or_traced(task))) - nfrozen++; + if (freezing(task)) { + ntotal++; + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. 
+ */ + if (frozen(task) || task_is_stopped_or_traced(task) || + freezer_should_skip(task)) + nfrozen++; + } } if (old_state == CGROUP_THAWED) { -- cgit v1.2.3 From 62da1921292ef789c23a7bf01d671d7572baf377 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Sep 2012 16:02:49 -0700 Subject: rcu: Accelerate callbacks for CPU initiating a grace period Because grace-period initialization is carried out by a separate kthread, it might happen on a different CPU than the one that had the callback needing a grace period -- which is where the callback acceleration needs to happen. Fortunately, rcu_start_gp() holds the root rcu_node structure's ->lock, which prevents a new grace period from starting. This allows this function to safely determine that a grace period has not yet started, which in turn allows it to fully accelerate any callbacks that it has pending. This commit adds this acceleration. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204..93d6871bf7f9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1404,15 +1404,37 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period - * task or this CPU does not need another grace period. + * task, this CPU does not need another grace period, + * or a grace period is already in progress. * Either way, don't start a new grace period. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } + /* + * Because there is no grace period in progress right now, + * any callbacks we have up to this point will be satisfied + * by the next grace period. So promote all callbacks to be + * handled after the end of the next grace period. If the + * CPU is not yet aware of the end of the previous grace period, + * we need to allow for the callback advancement that will + * occur when it does become aware. Deadlock prevents us from + * making it aware at this point: We cannot acquire a leaf + * rcu_node ->lock while holding the root rcu_node ->lock. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + if (rdp->completed == rsp->completed) + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + + /* Ensure that CPU is aware of completion of last grace period. */ + rcu_process_gp_end(rsp, rdp); + local_irq_restore(flags); + + /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); } -- cgit v1.2.3 From 8755ade683241e8c6b8fe8d22d0ae35041a3dc51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: allow moving tasks in and out of a frozen cgroup cgroup_freezer is one of the few users of cgroup_subsys->can_attach() and uses it to prevent tasks from being migrated into or out of a frozen cgroup. This makes cgroup_freezer cumbersome to use especially when co-mounted with other controllers. ->can_attach() is problematic in general as it can make co-mounting multiple cgroups difficult - migrating tasks may fail for reasons completely irrelevant for other controllers. 
freezer_can_attach() in particular is more problematic because it messes with cgroup internal locking to ensure that the state verification performed at freezer_can_attach() stays valid until migration is complete. This patch replaces freezer_can_attach() with freezer_attach() so that tasks are always allowed to migrate - they are nudged into the conforming state from freezer_attach(). This means that there can be tasks which are being migrated which don't conform to the current cgroup_freezer state until freezer_attach() is complete. Under the current locking scheme, the only such place is freezer_fork() which is updated to handle such window. While this patch doesn't remove the use of internal cgroup locking from freezer_read/write() paths, it removes the requirement to keep the freezer state constant while migrating and enables such change. Note that this creates a userland visible behavior change - FROZEN cgroup can no longer be used to lock migrations in and out of the cgroup. This behavior change is intended. I don't think the feature is necessary - userland should coordinate accesses to cgroup fs anyway - and even if the feature is needed cgroup_freezer is the completely wrong place to implement it. Signed-off-by: Tejun Heo LKML-Reference: <1350426526-14254-1-git-send-email-tj@kernel.org> Cc: Matt Helsley Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: Li Zefan --- kernel/cgroup_freezer.c | 51 ++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 557f3678c4e4..0b0e10545ef0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -152,27 +152,38 @@ static void freezer_destroy(struct cgroup *cgroup) /* * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence the - * can_attach() result will remain valid until the attach completes. + * a write to that file racing against an attach, and hence we don't need + * to worry about racing against migration. */ -static int freezer_can_attach(struct cgroup *new_cgroup, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { - struct freezer *freezer; + struct freezer *freezer = cgroup_freezer(new_cgrp); struct task_struct *task; + spin_lock_irq(&freezer->lock); + /* - * Anything frozen can't move or be moved to/from. + * Make the new tasks conform to the current state of @new_cgrp. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_cgrp but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + * This means that, to determine whether to freeze, one should test + * whether the state equals THAWED. 
*/ - cgroup_taskset_for_each(task, new_cgroup, tset) - if (cgroup_freezing(task)) - return -EBUSY; - - freezer = cgroup_freezer(new_cgroup); - if (freezer->state != CGROUP_THAWED) - return -EBUSY; + cgroup_taskset_for_each(task, new_cgrp, tset) { + if (freezer->state == CGROUP_THAWED) { + __thaw_task(task); + } else { + freeze_task(task); + freezer->state = CGROUP_FREEZING; + } + } - return 0; + spin_unlock_irq(&freezer->lock); } static void freezer_fork(struct task_struct *task) @@ -190,12 +201,12 @@ static void freezer_fork(struct task_struct *task) goto out; spin_lock_irq(&freezer->lock); - BUG_ON(freezer->state == CGROUP_FROZEN); - - /* Locking avoids race with FREEZING -> THAWED transitions. */ - if (freezer->state == CGROUP_FREEZING) + /* + * @task might have been just migrated into a FROZEN cgroup. Test + * equality with THAWED. Read the comment in freezer_attach(). + */ + if (freezer->state != CGROUP_THAWED) freeze_task(task); - spin_unlock_irq(&freezer->lock); out: rcu_read_unlock(); @@ -352,7 +363,7 @@ struct cgroup_subsys freezer_subsys = { .create = freezer_create, .destroy = freezer_destroy, .subsys_id = freezer_subsys_id, - .can_attach = freezer_can_attach, + .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, -- cgit v1.2.3 From b4d18311d37b0b1b370a1ef3e4de92b97930f0a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: prepare update_if_frozen() for locking change Locking will change such that migration can happen while freezer_read/write() is in progress. This means that update_if_frozen() can no longer assume that all tasks in the cgroup coform to the current freezer state - newly migrated tasks which haven't finished freezer_attach() yet might be in any state. This patch updates update_if_frozen() such that it no longer verifies task states against freezer state. It now simply decides whether FREEZING stage is complete. This removal of verification makes it meaningless to call from freezer_change_state(). Drop it and move the fast exit test from freezer_read() - the only left caller - to update_if_frozen(). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: Li Zefan --- kernel/cgroup_freezer.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 0b0e10545ef0..3d45503a21a2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -213,41 +213,39 @@ out: } /* - * caller must hold freezer->lock + * We change from FREEZING to FROZEN lazily if the cgroup was only + * partially frozen when we exitted write. Caller must hold freezer->lock. + * + * Task states and freezer state might disagree while tasks are being + * migrated into @cgroup, so we can't verify task states against @freezer + * state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup, - struct freezer *freezer) +static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; - unsigned int nfrozen = 0, ntotal = 0; - enum freezer_state old_state = freezer->state; + + if (freezer->state != CGROUP_FREEZING) + return; cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { if (freezing(task)) { - ntotal++; /* * freezer_should_skip() indicates that the task * should be skipped when determining freezing * completion. 
Consider it frozen in addition to * the usual frozen condition. */ - if (frozen(task) || task_is_stopped_or_traced(task) || - freezer_should_skip(task)) - nfrozen++; + if (!frozen(task) && !task_is_stopped_or_traced(task) && + !freezer_should_skip(task)) + goto notyet; } } - if (old_state == CGROUP_THAWED) { - BUG_ON(nfrozen > 0); - } else if (old_state == CGROUP_FREEZING) { - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - } else { /* old_state == CGROUP_FROZEN */ - BUG_ON(nfrozen != ntotal); - } - + freezer->state = CGROUP_FROZEN; +notyet: cgroup_iter_end(cgroup, &it); } @@ -262,13 +260,8 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); + update_if_frozen(cgroup, freezer); state = freezer->state; - if (state == CGROUP_FREEZING) { - /* We change from FREEZING to FROZEN lazily if the cgroup was - * only partially frozen when we exitted write. */ - update_if_frozen(cgroup, freezer); - state = freezer->state; - } spin_unlock_irq(&freezer->lock); cgroup_unlock(); @@ -306,8 +299,6 @@ static void freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); - update_if_frozen(cgroup, freezer); - switch (goal_state) { case CGROUP_THAWED: if (freezer->state != CGROUP_THAWED) -- cgit v1.2.3 From ead5c473712eb26db792b18a4dc98fdb312883fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:15 -0700 Subject: cgroup_freezer: don't use cgroup_lock_live_group() freezer_read/write() used cgroup_lock_live_group() to synchronize against task migration into and out of the target cgroup. cgroup_lock_live_group() grabs the internal cgroup lock and using it from outside cgroup core leads to complex and fragile locking dependency issues which are difficult to resolve. Now that freezer_can_attach() is replaced with freezer_attach() and update_if_frozen() updated, nothing requires excluding migration against freezer state reads and changes. This patch removes cgroup_lock_live_group() and the matching cgroup_unlock() usages. The prone-to-bitrot, already outdated and unnecessary global lock hierarchy documentation is replaced with documentation in local scope. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. 
Wysocki Cc: Li Zefan --- kernel/cgroup_freezer.c | 66 ++++++++----------------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3d45503a21a2..8a92b0e52099 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -84,50 +84,6 @@ static const char *freezer_state_strs[] = { struct cgroup_subsys freezer_subsys; -/* Locks taken and their ordering - * ------------------------------ - * cgroup_mutex (AKA cgroup_lock) - * freezer->lock - * css_set_lock - * task->alloc_lock (AKA task_lock) - * task->sighand->siglock - * - * cgroup code forces css_set_lock to be taken before task->alloc_lock - * - * freezer_create(), freezer_destroy(): - * cgroup_mutex [ by cgroup core ] - * - * freezer_can_attach(): - * cgroup_mutex (held by caller of can_attach) - * - * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * freezer->lock - * sighand->siglock (if the cgroup is freezing) - * - * freezer_read(): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * - * freezer_write() (freeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock (fake signal delivery inside freeze_task()) - * - * freezer_write() (unfreeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) - * sighand->siglock - */ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -151,9 +107,13 @@ static void freezer_destroy(struct cgroup *cgroup) } /* - * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence we don't need - * to worry about racing against migration. + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. */ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { @@ -217,8 +177,8 @@ out: * partially frozen when we exitted write. Caller must hold freezer->lock. * * Task states and freezer state might disagree while tasks are being - * migrated into @cgroup, so we can't verify task states against @freezer - * state here. See freezer_attach() for details. + * migrated into or out of @cgroup, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. 
*/ static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { @@ -255,15 +215,11 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct freezer *freezer; enum freezer_state state; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); update_if_frozen(cgroup, freezer); state = freezer->state; spin_unlock_irq(&freezer->lock); - cgroup_unlock(); seq_puts(m, freezer_state_strs[state]); seq_putc(m, '\n'); @@ -297,6 +253,7 @@ static void freezer_change_state(struct cgroup *cgroup, { struct freezer *freezer = cgroup_freezer(cgroup); + /* also synchronizes against task migration, see freezer_attach() */ spin_lock_irq(&freezer->lock); switch (goal_state) { @@ -332,10 +289,7 @@ static int freezer_write(struct cgroup *cgroup, else return -EINVAL; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; freezer_change_state(cgroup, goal_state); - cgroup_unlock(); return 0; } -- cgit v1.2.3 From 5efbe4279f959a3f5ed26adf5f05cb78dd1ffa7e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 23 Oct 2012 01:07:46 +0200 Subject: PM / QoS: Introduce request and constraint data types for PM QoS flags Introduce struct pm_qos_flags_request and struct pm_qos_flags representing PM QoS flags request type and PM QoS flags constraint type, respectively. With these definitions the data structures will be arranged so that the list member of a struct pm_qos_flags object will contain the head of a list of struct pm_qos_flags_request objects representing all of the "flags" requests present for the given device. Then, the effective_flags member of a struct pm_qos_flags object will contain the bitwise OR of the flags members of all the struct pm_qos_flags_request objects in the list. Additionally, introduce helper function pm_qos_update_flags() allowing the caller to manage the list of struct pm_qos_flags_request pointed to by the list member of struct pm_qos_flags. The flags are of type s32 so that the request's "value" field is always of the same type regardless of what kind of request it is (latency requests already have value fields of type s32). Signed-off-by: Rafael J. Wysocki Reviewed-by: Jean Pihet Acked-by: mark gross --- kernel/power/qos.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 846bd42c7ed1..2ab2819aee65 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -212,6 +212,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, } } +/** + * pm_qos_flags_remove_req - Remove device PM QoS flags request. + * @pqf: Device PM QoS flags set to remove the request from. + * @req: Request to remove from the set. + */ +static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req) +{ + s32 val = 0; + + list_del(&req->node); + list_for_each_entry(req, &pqf->list, node) + val |= req->flags; + + pqf->effective_flags = val; +} + +/** + * pm_qos_update_flags - Update a set of PM QoS flags. + * @pqf: Set of flags to update. + * @req: Request to add to the set, to modify, or to remove from the set. + * @action: Action to take on the set. + * @val: Value of the request to add or modify. + * + * Update the given set of PM QoS flags and call notifiers if the aggregate + * value has changed. Returns 1 if the aggregate constraint value has changed, + * 0 otherwise. 
+ */ +bool pm_qos_update_flags(struct pm_qos_flags *pqf, + struct pm_qos_flags_request *req, + enum pm_qos_req_action action, s32 val) +{ + unsigned long irqflags; + s32 prev_value, curr_value; + + spin_lock_irqsave(&pm_qos_lock, irqflags); + + prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + switch (action) { + case PM_QOS_REMOVE_REQ: + pm_qos_flags_remove_req(pqf, req); + break; + case PM_QOS_UPDATE_REQ: + pm_qos_flags_remove_req(pqf, req); + case PM_QOS_ADD_REQ: + req->flags = val; + INIT_LIST_HEAD(&req->node); + list_add_tail(&req->node, &pqf->list); + pqf->effective_flags |= val; + break; + default: + /* no action */ + ; + } + + curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags; + + spin_unlock_irqrestore(&pm_qos_lock, irqflags); + + return prev_value != curr_value; +} + /** * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested -- cgit v1.2.3 From daee779718a319ff9f83e1ba3339334ac650bb22 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sat, 22 Sep 2012 19:52:11 +0200 Subject: console: implement lockdep support for console_lock Dave Airlie recently discovered a locking bug in the fbcon layer, where a timer_del_sync (for the blinking cursor) deadlocks with the timer itself, since both (want to) hold the console_lock: https://lkml.org/lkml/2012/8/21/36 Unfortunately the console_lock isn't a plain mutex and hence has no lockdep support. Which resulted in a few days wasted of tracking down this bug (complicated by the fact that printk doesn't show anything when the console is locked) instead of noticing the bug much earlier with the lockdep splat. Hence I've figured I need to fix that for the next deadlock involving console_lock - and with kms/drm growing ever more complex locking that'll eventually happen. Now the console_lock has rather funky semantics, so after a quick irc discussion with Thomas Gleixner and Dave Airlie I've quickly ditched the original idead of switching to a real mutex (since it won't work) and instead opted to annotate the console_lock with lockdep information manually. There are a few special cases: - The console_lock state is protected by the console_sem, and usually grabbed/dropped at _lock/_unlock time. But the suspend/resume code drops the semaphore without dropping the console_lock (see suspend_console/resume_console). But since the same thread that did the suspend will do the resume, we don't need to fix up anything. - In the printk code there's a special trylock, only used to kick off the logbuffer printk'ing in console_unlock. But all that happens while lockdep is disable (since printk does a few other evil tricks). So no issue there, either. - The console_lock can also be acquired form irq context (but only with a trylock). lockdep already handles that. This all leaves us with annotating the normal console_lock, _unlock and _trylock functions. And yes, it works - simply unloading a drm kms driver resulted in lockdep complaining about the deadlock in fbcon_deinit: ====================================================== [ INFO: possible circular locking dependency detected ] 3.6.0-rc2+ #552 Not tainted ------------------------------------------------------- kms-reload/3577 is trying to acquire lock: ((&info->queue)){+.+...}, at: [] wait_on_work+0x0/0xa7 but task is already holding lock: (console_lock){+.+.+.}, at: [] bind_con_driver+0x38/0x263 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (console_lock){+.+.+.}: [] lock_acquire+0x95/0x105 [] console_lock+0x59/0x5b [] fb_flashcursor+0x2e/0x12c [] process_one_work+0x1d9/0x3b4 [] worker_thread+0x1a7/0x24b [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 -> #0 ((&info->queue)){+.+...}: [] __lock_acquire+0x999/0xcf6 [] lock_acquire+0x95/0x105 [] wait_on_work+0x3b/0xa7 [] __cancel_work_timer+0xbf/0x102 [] cancel_work_sync+0xb/0xd [] fbcon_deinit+0x11c/0x1dc [] bind_con_driver+0x145/0x263 [] unbind_con_driver+0x14f/0x195 [] store_bind+0x1ad/0x1c1 [] dev_attr_store+0x13/0x1f [] sysfs_write_file+0xe9/0x121 [] vfs_write+0x9b/0xfd [] sys_write+0x3e/0x6b [] system_call_fastpath+0x16/0x1b other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(console_lock); lock((&info->queue)); lock(console_lock); lock((&info->queue)); *** DEADLOCK *** v2: Mark the lockdep_map static, noticed by Jani Nikula. Cc: Dave Airlie Cc: Thomas Gleixner Cc: Alan Cox Cc: Peter Zijlstra Signed-off-by: Daniel Vetter Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2d607f4d1797..ee79f14db358 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -87,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_lock_dep_map = { + .name = "console_lock" +}; +#endif + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -1914,6 +1920,7 @@ void console_lock(void) return; console_locked = 1; console_may_schedule = 1; + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1935,6 +1942,7 @@ int console_trylock(void) } console_locked = 1; console_may_schedule = 0; + mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2095,6 +2103,7 @@ skip: local_irq_restore(flags); } console_locked = 0; + mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) -- cgit v1.2.3 From abfd6e58aed4f89fd69b9b17bc4b4527efe3a645 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Sep 2012 16:59:47 -0700 Subject: rcu: Fix comment about _rcu_barrier()/orphanage exclusion In the old days, _rcu_barrier() acquired ->onofflock to exclude rcu_send_cbs_to_orphanage(), which allowed the latter to avoid memory barriers in callback handling. However, _rcu_barrier() recently started doing get_online_cpus() to lock out CPU-hotplug operations entirely, which means that the comment in rcu_send_cbs_to_orphanage() that talks about ->onofflock is now obsolete. This commit therefore fixes the comment. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204..ac8aed8ee417 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1581,8 +1581,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, { /* * Orphan the callbacks. First adjust the counts. This is safe - * because ->onofflock excludes _rcu_barrier()'s adoption of - * the callbacks, thus no memory barrier is required. 
+ * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. */ if (rdp->nxtlist != NULL) { rsp->qlen_lazy += rdp->qlen_lazy; -- cgit v1.2.3 From 489832609a1ad7189d11715d8cefb457d90182c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Sep 2012 16:08:31 -0700 Subject: rcu: Make rcutorture give diagnostics if CPU offline fails This commit causes rcutorture to print the errno if cpu_down() fails when the rcutorture "verbose" module parameter is specified. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a..9900f560f1bd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1502,6 +1502,7 @@ rcu_torture_onoff(void *arg) unsigned long delta; int maxcpu = -1; DEFINE_RCU_RANDOM(rand); + int ret; unsigned long starttime; VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); @@ -1522,7 +1523,13 @@ rcu_torture_onoff(void *arg) torture_type, cpu); starttime = jiffies; n_offline_attempts++; - if (cpu_down(cpu) == 0) { + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { if (verbose) pr_alert("%s" TORTURE_FLAG "rcu_torture_onoff task: offlined %d\n", -- cgit v1.2.3 From 4d9a5d4319e22670ec6d6227e12b54f361c46d0f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Oct 2012 01:47:16 +0200 Subject: rcu: Remove rcu_switch() It's only there to call rcu_user_hooks_switch(). Let's just call rcu_user_hooks_switch() directly, we don't need this function in the middle. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Peter Zijlstra Cc: Richard Weinberger Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..68414fa4cd2a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1887,7 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch(prev, next); + rcu_user_hooks_switch(prev, next); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3 From 3705b88db0d7cc4a097c32d9e554054103d3f807 Mon Sep 17 00:00:00 2001 From: Antti P Miettinen Date: Fri, 5 Oct 2012 09:59:15 +0300 Subject: rcu: Add a module parameter to force use of expedited RCU primitives There have been some embedded applications that would benefit from use of expedited grace-period primitives. In some ways, this is similar to synchronize_net() doing either a normal or an expedited grace period depending on lock state, but with control outside of the kernel. This commit therefore adds rcu_expedited boot and sysfs parameters that cause the kernel to substitute expedited primitives for the normal grace-period primitives. [ paulmck: Add trace/event/rcu.h to kernel/srcu.c to avoid build error. Get rid of infinite loop through contention path.] Signed-off-by: Antti P Miettinen Signed-off-by: Paul E. 
McKenney --- kernel/ksysfs.c | 18 ++++++++++++++++++ kernel/rcu.h | 2 ++ kernel/rcupdate.c | 3 +++ kernel/rcutiny_plugin.h | 5 ++++- kernel/rcutree.c | 12 +++++++++--- kernel/rcutree_plugin.h | 7 +++++-- kernel/srcu.c | 8 +++++++- 7 files changed, 48 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf58..8715a798aa7c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -141,6 +141,23 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +int rcu_expedited; +static ssize_t rcu_expedited_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", rcu_expedited); +} +static ssize_t rcu_expedited_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_expedited)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_expedited); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -182,6 +199,7 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif + &rcu_expedited_attr.attr, NULL }; diff --git a/kernel/rcu.h b/kernel/rcu.h index 8ba99cdc6515..20dfba576c2b 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -109,4 +109,6 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) } } +extern int rcu_expedited; + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 29ca1c6da594..a2cf76177b44 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,12 +46,15 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include #include "rcu.h" +module_param(rcu_expedited, int, 0); + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3d0190282204..f85016a2309b 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -706,7 +706,10 @@ void synchronize_rcu(void) return; /* Once we get past the fastpath checks, same code as rcu_barrier(). 
*/ - rcu_barrier(); + if (rcu_expedited) + synchronize_rcu_expedited(); + else + rcu_barrier(); } EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204..f9c17c399538 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2224,7 +2224,10 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - wait_rcu_gp(call_rcu_sched); + if (rcu_expedited) + synchronize_sched_expedited(); + else + wait_rcu_gp(call_rcu_sched); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -2245,7 +2248,10 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - wait_rcu_gp(call_rcu_bh); + if (rcu_expedited) + synchronize_rcu_bh_expedited(); + else + wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -2328,7 +2334,7 @@ void synchronize_sched_expedited(void) if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - synchronize_sched(); + wait_rcu_gp(call_rcu_sched); return; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f92115488187..c177ba0cce9a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -679,7 +679,10 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - wait_rcu_gp(call_rcu); + if (rcu_expedited) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -831,7 +834,7 @@ void synchronize_rcu_expedited(void) udelay(trycount * num_online_cpus()); } else { put_online_cpus(); - synchronize_rcu(); + wait_rcu_gp(call_rcu); return; } } diff --git a/kernel/srcu.c b/kernel/srcu.c index 97c465ebd844..de9074047c92 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -34,6 +34,10 @@ #include #include +#include + +#include "rcu.h" + /* * Initialize an rcu_batch structure to empty. */ @@ -464,7 +468,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); + __synchronize_srcu(sp, rcu_expedited + ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT + : SYNCHRONIZE_SRCU_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu); -- cgit v1.2.3 From 340f588bbaed6cb518aa65e7a330dcc3fff911f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2012 08:24:54 -0700 Subject: rcu: Fix precedence error in cpu_needs_another_gp() The fix introduced by a10d206e (rcu: Fix day-one dyntick-idle stall-warning bug) has a C-language precedence error. It turns out that this error is harmless in that the same result is computed for all inputs, but the code is nevertheless a potential source of confusion. This commit therefore introduces parentheses in order to force the execution of the code to reflect the intent. Reported-by: Ben Hutchings Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f9c17c399538..effd47a54b36 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -313,7 +313,7 @@ static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { return *rdp->nxttail[RCU_DONE_TAIL + - ACCESS_ONCE(rsp->completed) != rdp->completed] && + (ACCESS_ONCE(rsp->completed) != rdp->completed)] && !rcu_gp_in_progress(rsp); } -- cgit v1.2.3 From 4e87b2d7e887df3fe251dd7f702591a3acf369ca Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:14 +0800 Subject: srcu: Credit Lai Jiangshan with SRCU rewrite Lai Jiangshan rewrote SRCU, so this commit ensures that he gets his proper share of blame^Wcredit. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 97c465ebd844..0b99f27fa2f7 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -16,8 +16,10 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney + * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt -- cgit v1.2.3 From f2ebfbc991044fd5b89d4529741d7500feb37fbd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:15 +0800 Subject: srcu: Export process_srcu() Because process_srcu() will be used in DEFINE_SRCU(), which is a macro that could be expanded pretty much anywhere, it can no longer be static. Note that process_srcu() is still internal to srcu.h. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/srcu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/srcu.c b/kernel/srcu.c index 0b99f27fa2f7..b363b0924561 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -94,9 +94,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) } } -/* single-thread state-machine */ -static void process_srcu(struct work_struct *work); - static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; @@ -639,7 +636,7 @@ static void srcu_reschedule(struct srcu_struct *sp) /* * This is the work-queue function that handles SRCU grace periods. */ -static void process_srcu(struct work_struct *work) +void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -650,3 +647,4 @@ static void process_srcu(struct work_struct *work) srcu_invoke_callbacks(sp); srcu_reschedule(sp); } +EXPORT_SYMBOL_GPL(process_srcu); -- cgit v1.2.3 From b637a328bd4f43a0e146d1eef0142b650ba0d644 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Sep 2012 16:58:38 -0700 Subject: rcu: Print remote CPU's stacks in stall warnings The RCU CPU stall warnings rely on trigger_all_cpu_backtrace() to do NMI-based dump of the stack traces of all CPUs. Unfortunately, a number of architectures do not implement trigger_all_cpu_backtrace(), in which case RCU falls back to just dumping the stack of the running CPU. This is unhelpful in the case where the running CPU has detected that some other CPU has stalled. This commit therefore makes the running CPU dump the stacks of the tasks running on the stalled CPUs. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 25 ++++++++++++++++++++++++- kernel/sched/core.c | 6 ++++++ 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 74df86bd9204..e78538712df0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -873,6 +873,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); } +/* + * Dump stacks of all tasks running on stalled CPUs. This is a fallback + * for architectures that do not implement trigger_all_cpu_backtrace(). + * The NMI-triggered stack traces are more accurate because they are + * printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + dump_cpu_task(rnp->grplo + cpu); + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -929,7 +952,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) - dump_stack(); + rcu_dump_cpu_stacks(rsp); /* Complain about tasks blocking the grace period. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..59d08fb1a9e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8076,3 +8076,9 @@ struct cgroup_subsys cpuacct_subsys = { .base_cftypes = files, }; #endif /* CONFIG_CGROUP_CPUACCT */ + +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} -- cgit v1.2.3 From eee058826100e5a344f11601e0c47baeaf07c77b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Sep 2012 14:15:05 -0700 Subject: rcu: Add grace-period information to RCU CPU stall warnings This commit causes the last grace period started and completed to be printed on RCU CPU stall warning messages in order to aid diagnosis. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e78538712df0..8f4de3a7c1f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -947,8 +947,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); - printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start)); + pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start), + rsp->gpnum, rsp->completed); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -975,7 +976,8 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); - printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); + pr_cont(" (t=%lu jiffies g=%lu c=%lu)\n", + jiffies - rsp->gp_start, rsp->gpnum, rsp->completed); if (!trigger_all_cpu_backtrace()) dump_stack(); -- cgit v1.2.3 From 53bb857c373d6b7936f8a7b4451f0a99703c308e Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 21 Sep 2012 16:35:25 -0700 Subject: rcu: Dump number of callbacks in stall warning messages In theory, if a grace period manages to get started despite there being no callbacks on any of the CPUs, all CPUs could go into dyntick-idle mode, so that the grace period would never end. This commit updates the RCU CPU stall warning messages to detect this condition by summing up the number of callbacks on all CPUs. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8f4de3a7c1f7..24b21cba2cc8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -903,6 +903,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) unsigned long flags; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; /* Only let one CPU complain about others per time interval. */ @@ -947,9 +948,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); print_cpu_stall_info_end(); - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu)\n", + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed); + rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -964,8 +967,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) static void print_cpu_stall(struct rcu_state *rsp) { + int cpu; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; /* * OK, time to rat on ourselves... @@ -976,8 +981,10 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); - pr_cont(" (t=%lu jiffies g=%lu c=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed); + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", + jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); -- cgit v1.2.3 From 6b898c07cb1d5bd8344a8044288bb4ae3873da74 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 17 Sep 2012 23:03:31 +0000 Subject: console: use might_sleep in console_lock Instead of BUG_ON(in_interrupt()), since that doesn't check for all the newfangled stuff like preempt. Note that this is valid since the console_sem is essentially used like a real mutex with only two twists: - we allow trylock from hardirq context - across suspend/resume we lock the logical console_lock, but drop the semaphore protecting the locking state. Now that doesn't guarantee that no one is playing tricks in single-thread atomic contexts at suspend/resume/boot time, but - I couldn't find anything suspicious with some grepping, - might_sleep shouldn't die, - and I think the upside of catching more potential issues is worth the risk of getting a might_sleep backtrace that would have been save (and then dealing with that fallout). 
Cc: Dave Airlie Cc: Thomas Gleixner Cc: Alan Cox Cc: Peter Zijlstra Signed-off-by: Daniel Vetter Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ee79f14db358..22e070f3470a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1914,7 +1914,8 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, */ void console_lock(void) { - BUG_ON(in_interrupt()); + might_sleep(); + down(&console_sem); if (console_suspended) return; -- cgit v1.2.3 From 351f181f9134d71efd46ddf0c0abca31b58cd79b Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 25 Oct 2012 01:07:35 +0800 Subject: timers, sched: Correct the comments for tick_sched_timer() In the comments of tick_sched_timer(), the sentence "timer->base->cpu_base->lock held" is not right: in __run_hrtimer(), the cpu_base->lock has already been unlocked before timer->function() is called. Signed-off-by: liu chuansheng Cc: fei.li@intel.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1351098455.15558.1421.camel@cliu38-desktop-build Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..2bc73d3bf7fa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -794,7 +794,7 @@ void tick_check_idle(int cpu) #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { -- cgit v1.2.3 From 9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:29 +0200 Subject: sched: Track the runnable average on a per-task entity basis Instead of tracking an averaged load on the cfs_rq itself, we can track entity load directly, with the load for a given cfs_rq then being the sum of its children. To do this we represent the historical contribution to runnable average within each trailing 1024us of execution as the coefficients of a geometric series. We can express this for a given task t as: runnable_sum(t) = \Sum u_i * y^i, runnable_avg_period(t) = \Sum 1024 * y^i, load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t) Where: u_i is the usage in the i'th most recent 1024us period (approximately 1ms) and y is chosen such that y^k = 1/2. We currently choose k to be 32, which roughly translates to a scheduling period.
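To make the series concrete, here is a minimal user-space sketch of the update rule above. It is illustrative only: floating point and made-up inputs stand in for the fixed-point decay_load() arithmetic the patch below adds.

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 1/2, as described above */
	double runnable_sum = 0.0, runnable_period = 0.0;
	int i;

	/* 64 one-millisecond periods: runnable for the first 32, idle after */
	for (i = 0; i < 64; i++) {
		double u_i = (i < 32) ? 1024.0 : 0.0;	/* usage in this 1024us slot */

		/* period roll-over: decay the old sums, then add the new u_0 */
		runnable_sum = runnable_sum * y + u_i;
		runnable_period = runnable_period * y + 1024.0;

		if ((i % 16) == 15)
			printf("after %2d ms: runnable fraction = %.3f\n",
			       i + 1, runnable_sum / runnable_period);
	}
	return 0;
}

Running it shows the average near 1.0 while the task is runnable and decaying by roughly half every 32ms of idleness, which is exactly the behaviour the choice of k = 32 is meant to give.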
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.372695337@google.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 ++ kernel/sched/debug.c | 4 ++ kernel/sched/fair.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..fd9d0859350a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1524,6 +1524,11 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_SMP + p->se.avg.runnable_avg_period = 0; + p->se.avg.runnable_avg_sum = 0; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea9..61f70979153a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->statistics.wait_count); #endif P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); +#endif #undef PN #undef P } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..16d67f9b6955 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -971,6 +971,126 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_SMP +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) +{ + for (; n && val; n--) { + val *= 4008; + val >>= 12; + } + + return val; +} + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable) +{ + u64 delta; + int delta_w, decayed = 0; + + delta = now - sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. 
+ */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; + BUG_ON(delta_w > delta); + do { + if (runnable) + sa->runnable_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + /* + * Remainder of delta initiates a new period, roll over + * the previous. + */ + sa->runnable_avg_sum = + decay_load(sa->runnable_avg_sum, 1); + sa->runnable_avg_period = + decay_load(sa->runnable_avg_period, 1); + + delta -= delta_w; + /* New period is empty */ + delta_w = 1024; + } while (delta >= 1024); + } + + /* Remainder of delta accrued against u_0` */ + if (runnable) + sa->runnable_avg_sum += delta; + sa->runnable_avg_period += delta; + + return decayed; +} + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se) +{ + __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, + se->on_rq); +} +#else +static inline void update_entity_load_avg(struct sched_entity *se) {} +#endif + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -1097,6 +1217,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); + update_entity_load_avg(se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1171,6 +1292,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_entity_load_avg(se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1340,6 +1462,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev); } cfs_rq->curr = NULL; } @@ -1352,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Ensure that runnable average is periodically updated. + */ + update_entity_load_avg(curr); + /* * Update share accounting for long-running entities. */ -- cgit v1.2.3 From 18bf2805d9b30cb823d4919b42cd230f59c7ce1f Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Thu, 4 Oct 2012 12:51:20 +0200 Subject: sched: Maintain per-rq runnable averages Since runqueues do not have a corresponding sched_entity we instead embed a sched_avg structure directly. 
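A compact sketch of the layout this describes, with illustrative (non-kernel) type names: each sched_entity carries its own average, while the runqueue, having no entity of its own, embeds the same accounting structure directly and updates it from update_rq_runnable_avg().

/* illustrative only; the kernel's struct rq and struct sched_avg differ */
struct demo_sched_avg {
	unsigned int runnable_avg_sum;		/* decayed runnable time */
	unsigned int runnable_avg_period;	/* decayed total time */
};

struct demo_rq {
	unsigned int nr_running;		/* ...other runqueue state elided... */
	struct demo_sched_avg avg;		/* rq-level average, no sched_entity needed */
};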
Signed-off-by: Ben Segall Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.442637130@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 10 ++++++++-- kernel/sched/fair.c | 18 ++++++++++++++++-- kernel/sched/sched.h | 2 ++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 61f70979153a..4240abce4116 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec) static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; - if (!se) - return; #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); + return; + } + + PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16d67f9b6955..8c5468fcf10d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1087,8 +1087,14 @@ static inline void update_entity_load_avg(struct sched_entity *se) __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, se->on_rq); } + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); +} #else static inline void update_entity_load_avg(struct sched_entity *se) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2340,8 +2346,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); inc_nr_running(rq); + } hrtick_update(rq); } @@ -2399,8 +2407,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { dec_nr_running(rq); + update_rq_runnable_avg(rq, 1); + } hrtick_update(rq); } @@ -4586,6 +4596,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; + update_rq_runnable_avg(this_rq, 1); + /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ @@ -5083,6 +5095,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + update_rq_runnable_avg(rq, 1); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfabc..14b571968713 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -467,6 +467,8 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + + struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) -- cgit v1.2.3 From 2dac754e10a5d41d94d2d2365c0345d4f215a266 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Aggregate load contributed by task entities on parenting cfs_rq For a given task t, we can compute its contribution to load as: task_load(t) = runnable_avg(t) * weight(t) On a parenting cfs_rq we can then aggregate: runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t Maintain this bottom up, with task entities adding their contributed load to the parenting cfs_rq sum. 
When a task entity's load changes we add the same delta to the maintained sum. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.514678907@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 3 +++ kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 59 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4240abce4116..c953a89f94aa 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); + P(se->avg.load_avg_contrib); #endif #undef PN #undef P @@ -224,6 +225,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->load_contribution); SEQ_printf(m, " .%-30s: %d\n", "load_tg", atomic_read(&cfs_rq->tg->load_weight)); + SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c5468fcf10d..77af759e5675 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1081,20 +1081,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return decayed; } +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (!entity_is_task(se)) + return 0; + + se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum * + se->load.weight, + se->avg.runnable_avg_period + 1); + + return se->avg.load_avg_contrib - old_contrib; +} + /* Update a sched_entity's runnable average */ static inline void update_entity_load_avg(struct sched_entity *se) { - __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg, - se->on_rq); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + + if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg, + se->on_rq)) + return; + + contrib_delta = __update_entity_load_avg_contrib(se); + if (se->on_rq) + cfs_rq->runnable_load_avg += contrib_delta; } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); } + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + update_entity_load_avg(se); + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; +} + +/* Remove se's load from this cfs_rq child load-average */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + update_entity_load_avg(se); + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; +} #else static inline void update_entity_load_avg(struct sched_entity *se) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1223,7 +1266,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); 
update_cfs_load(cfs_rq, 0); - update_entity_load_avg(se); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1298,7 +1341,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_entity_load_avg(se); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14b571968713..e6539736af58 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -222,6 +222,15 @@ struct cfs_rq { unsigned int nr_spread_over; #endif +#ifdef CONFIG_SMP + /* + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). + */ + u64 runnable_load_avg; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -1214,4 +1223,3 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -- cgit v1.2.3 From 9ee474f55664ff63111c843099d365e7ecffb56f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Maintain the load contribution of blocked entities We are currently maintaining: runnable_load(cfs_rq) = \Sum task_load(t) For all running children t of cfs_rq. While this can be naturally updated for tasks in a runnable state (as they are scheduled); this does not account for the load contributed by blocked task entities. This can be solved by introducing a separate accounting for blocked load: blocked_load(cfs_rq) = \Sum runnable(b) * weight(b) Obviously we do not want to iterate over all blocked entities to account for their decay, we instead observe that: runnable_load(t) = \Sum p_i*y^i and that to account for an additional idle period we only need to compute: y*runnable_load(t). This means that we can compute all blocked entities at once by evaluating: blocked_load(cfs_rq)` = y * blocked_load(cfs_rq) Finally we maintain a decay counter so that when a sleeping entity re-awakens we can determine how much of its load should be removed from the blocked sum. 
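The bookkeeping this describes can be sketched in a few lines of user-space C. This is illustrative only: plain doubles and made-up values replace the kernel's fixed-point decay_load() and atomic decay_counter.

#include <math.h>
#include <stdio.h>

/* y^n with y^32 = 1/2, mirroring the decay used for runnable averages */
static double decay(double val, unsigned int n)
{
	return val * pow(0.5, n / 32.0);
}

struct demo_cfs_rq {
	double blocked_load_avg;
	unsigned long long decay_counter;	/* total periods decayed so far */
};

struct demo_se {
	double load_contrib;
	unsigned long long decay_count;		/* decay_counter snapshot at sleep */
};

int main(void)
{
	struct demo_cfs_rq cfs_rq = { 0.0, 0 };
	struct demo_se se = { 512.0, 0 };

	/* entity blocks: fold its contribution into the blocked sum */
	cfs_rq.blocked_load_avg += se.load_contrib;
	se.decay_count = cfs_rq.decay_counter;

	/* eight idle milliseconds pass: decay the whole blocked sum at once */
	cfs_rq.blocked_load_avg = decay(cfs_rq.blocked_load_avg, 8);
	cfs_rq.decay_counter += 8;

	/* entity wakes: catch its contribution up by the decays it missed,
	 * then pull it back out of the blocked sum */
	se.load_contrib = decay(se.load_contrib,
				cfs_rq.decay_counter - se.decay_count);
	cfs_rq.blocked_load_avg -= se.load_contrib;

	printf("blocked load after wakeup: %.3f\n", cfs_rq.blocked_load_avg);
	return 0;
}

The final print is 0.000: because the entity's contribution is decayed by the same number of periods recorded in the counter, removing it at wakeup leaves no residue in blocked_load_avg.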
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.585389902@google.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/debug.c | 3 ++ kernel/sched/fair.c | 128 +++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 4 +- 4 files changed, 121 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd9d0859350a..00898f1fb69e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1528,7 +1528,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif - #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c953a89f94aa..2d2e2b3c1bef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->avg.runnable_avg_sum); P(se->avg.runnable_avg_period); P(se->avg.load_avg_contrib); + P(se->avg.decay_count); #endif #undef PN #undef P @@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic_read(&cfs_rq->tg->load_weight)); SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77af759e5675..83194175e841 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq); + static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; + /* We should have no load, but we need to update last_decay. 
*/ + update_cfs_rq_blocked_load(cfs_rq); } } @@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, return decayed; } +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline void __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; +} + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { @@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + /* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se) +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; @@ -1107,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se) return; contrib_delta = __update_entity_load_avg_contrib(se); + + if (!update_cfs_rq) + return; + if (se->on_rq) cfs_rq->runnable_load_avg += contrib_delta; + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) +{ + u64 now = rq_of(cfs_rq)->clock_task >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays) + return; + + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + + cfs_rq->last_decay = now; } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1118,26 +1172,53 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable) /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) + struct sched_entity *se, + int wakeup) { - update_entity_load_avg(se); + /* we track migrations using entity decay_count == 0 */ + if (unlikely(!se->avg.decay_count)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + if (wakeup) + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + + update_entity_load_avg(se, 0); cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + update_cfs_rq_blocked_load(cfs_rq); } -/* Remove se's load from this cfs_rq child load-average */ +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. 
+ */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) + struct sched_entity *se, + int sleep) { - update_entity_load_avg(se); + update_entity_load_avg(se, 1); + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } #else -static inline void update_entity_load_avg(struct sched_entity *se) {} +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) {} + struct sched_entity *se, + int wakeup) {} static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se) {} + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1266,7 +1347,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - enqueue_entity_load_avg(cfs_rq, se); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1341,7 +1422,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1512,7 +1593,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev); + update_entity_load_avg(prev, 1); } cfs_rq->curr = NULL; } @@ -1528,7 +1609,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. */ - update_entity_load_avg(curr); + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq); /* * Update share accounting for long-running entities. @@ -2387,6 +2469,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } if (!se) { @@ -2448,6 +2531,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); } if (!se) { @@ -3498,6 +3582,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) update_rq_clock(rq); update_cfs_load(cfs_rq, 1); + update_cfs_rq_blocked_load(cfs_rq); /* * We need to update shares after updating tg->load_weight in @@ -5232,6 +5317,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. 
+ */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + } +#endif } /* @@ -5278,6 +5377,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + atomic64_set(&cfs_rq->decay_counter, 1); +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6539736af58..664ff39195f7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -229,7 +229,9 @@ struct cfs_rq { * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). */ - u64 runnable_load_avg; + u64 runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter; + u64 last_decay; #endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ -- cgit v1.2.3 From 0a74bef8bed18dc6889e9bc37ea1050a50c86c89 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Add an rq migration call-back to sched_class Since we are now doing bottom up load accumulation we need explicit notification when a task has been re-parented so that the old hierarchy can be updated. Adds: migrate_task_rq(struct task_struct *p, int next_cpu) (The alternative is to do this out of __set_task_cpu, but it was suggested that this would be a cleaner encapsulation.) Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.660023400@google.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 00898f1fb69e..f26860074ef2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83194175e841..5e602e6ba0c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3047,6 +3047,17 @@ unlock: return new_cpu; } + +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ +} #endif /* CONFIG_SMP */ static unsigned long @@ -5607,6 +5618,7 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, -- cgit v1.2.3 From aff3e498844441fa71c5ee1bbc470e1dff9548d9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Account for blocked load waking back up When a running entity blocks we migrate its tracked load to cfs_rq->blocked_runnable_avg. In the sleep case this occurs while holding rq->lock and so is a natural transition. 
Wake-ups however, are potentially asynchronous in the presence of migration and so special care must be taken. We use an atomic counter to track such migrated load, taking care to match this with the previously introduced decay counters so that we don't migrate too much load. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.726077467@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 100 ++++++++++++++++++++++++++++++++++++++++----------- kernel/sched/sched.h | 2 +- 2 files changed, 81 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e602e6ba0c3..74dc29ba1ad1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,7 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq); +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update); static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { @@ -281,7 +282,7 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->on_list = 1; /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq); + update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -1086,17 +1087,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now, } /* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline void __synchronize_entity_decay(struct sched_entity *se) +static inline u64 __synchronize_entity_decay(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; if (!decays) - return; + return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); se->avg.decay_count = 0; + + return decays; } /* Compute the current contribution to load_avg by se, return any delta */ @@ -1149,20 +1152,26 @@ static inline void update_entity_load_avg(struct sched_entity *se, * Decay the load contributed by all blocked children and account this so that * their contribution may appropriately discounted when they wake up. */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) { u64 now = rq_of(cfs_rq)->clock_task >> 20; u64 decays; decays = now - cfs_rq->last_decay; - if (!decays) + if (!decays && !force_update) return; - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); + if (atomic64_read(&cfs_rq->removed_load)) { + u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } - cfs_rq->last_decay = now; + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1175,20 +1184,42 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { - /* we track migrations using entity decay_count == 0 */ - if (unlikely(!se->avg.decay_count)) { + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. 
+ */ + if (unlikely(se->avg.decay_count <= 0)) { se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + /* Indicate that we're now synchronized and on-rq */ + se->avg.decay_count = 0; + } wakeup = 0; } else { __synchronize_entity_decay(se); } - if (wakeup) + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } - update_entity_load_avg(se, 0); cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - update_cfs_rq_blocked_load(cfs_rq); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); } /* @@ -1201,6 +1232,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) { update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; if (sleep) { @@ -1218,7 +1251,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} #endif static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1610,7 +1644,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq); + update_cfs_rq_blocked_load(cfs_rq, 1); /* * Update share accounting for long-running entities. @@ -3057,6 +3091,19 @@ unlock: static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a positive decay-count. It can never + * be negative here since on-rq tasks have decay-count == 0. 
+ */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + } } #endif /* CONFIG_SMP */ @@ -3593,7 +3640,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) update_rq_clock(rq); update_cfs_load(cfs_rq, 1); - update_cfs_rq_blocked_load(cfs_rq); + update_cfs_rq_blocked_load(cfs_rq, 1); /* * We need to update shares after updating tg->load_weight in @@ -5390,12 +5437,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #endif #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) atomic64_set(&cfs_rq->decay_counter, 1); + atomic64_set(&cfs_rq->removed_load, 0); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct cfs_rq *cfs_rq; /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5427,8 +5476,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); - if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * migrate_task_rq_fair() will have removed our previous + * contribution, but we must synchronize for ongoing future + * decay. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } } void free_fair_sched_group(struct task_group *tg) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 664ff39195f7..30236ab4edc0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -230,7 +230,7 @@ struct cfs_rq { * the FAIR_GROUP_SCHED case). */ u64 runnable_load_avg, blocked_load_avg; - atomic64_t decay_counter; + atomic64_t decay_counter, removed_load; u64 last_decay; #endif #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3 From c566e8e9e44b72b53091da20e2dedefc730f2ee2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:30 +0200 Subject: sched: Aggregate total task_group load Maintain a global running sum of the average load seen on each cfs_rq belonging to each task group so that it may be used in calculating an appropriate shares:weight distribution. 
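The global sum this maintains is only touched when a per-cpu delta becomes significant, mirroring the 1/8 threshold in the __update_cfs_rq_tg_load_contrib() hunk that follows. A rough user-space sketch, with plain variables and hypothetical numbers in place of the patch's atomics:

#include <stdio.h>
#include <stdlib.h>

static long long tg_load_avg;			/* shared per-task_group sum */

struct demo_cfs_rq {
	long long runnable_load_avg;
	long long blocked_load_avg;
	long long tg_load_contrib;		/* what this cpu last published */
};

static void update_tg_load_contrib(struct demo_cfs_rq *cfs_rq, int force)
{
	long long delta = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg
			  - cfs_rq->tg_load_contrib;

	/* only touch the shared sum when the delta is worth publishing */
	if (force || llabs(delta) > cfs_rq->tg_load_contrib / 8) {
		tg_load_avg += delta;
		cfs_rq->tg_load_contrib += delta;
	}
}

int main(void)
{
	struct demo_cfs_rq cfs_rq = { 1000, 200, 0 };

	update_tg_load_contrib(&cfs_rq, 0);
	printf("tg_load_avg = %lld\n", tg_load_avg);	/* 1200 */
	return 0;
}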
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.792901086@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++++ kernel/sched/fair.c | 22 ++++++++++++++++++++++ kernel/sched/sched.h | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2d2e2b3c1bef..290892361a09 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -230,6 +230,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 74dc29ba1ad1..db788222f198 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1102,6 +1102,26 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) return decays; } +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + s64 tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic64_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +#endif + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { @@ -1172,6 +1192,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) atomic64_add(decays, &cfs_rq->decay_counter); cfs_rq->last_decay = now; } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 30236ab4edc0..924a99094888 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,7 @@ struct task_group { unsigned long shares; atomic_t load_weight; + atomic64_t load_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -232,6 +233,9 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; +#ifdef CONFIG_FAIR_GROUP_SCHED + u64 tg_load_contrib; +#endif #endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ -- cgit v1.2.3 From 8165e145ceb62fc338e099c9b12b3239c83d2f8e Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: sched: Compute load contribution by a group entity Unlike task entities who have a fixed weight, group entities instead own a fraction of their parenting task_group's shares as their contributed weight. Compute this fraction so that we can correctly account hierarchies and shared entity nodes. 
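As a rough numeric sketch (not from the patch; simplified from __update_group_entity_contrib() below), a group entity's contributed weight is its cfs_rq's slice of the task_group's total tracked load, scaled by tg->shares; the +1 in the divisor only guards against a zero total:

#include <stdint.h>
#include <stdio.h>

/* Standalone model of the fraction computed per cpu for a group entity. */
static uint64_t group_entity_contrib(uint64_t cfs_rq_tg_load_contrib,
				     uint64_t tg_shares,
				     uint64_t tg_load_avg)
{
	/* this cpu's slice of the group's shares, in load units */
	return cfs_rq_tg_load_contrib * tg_shares / (tg_load_avg + 1);
}

int main(void)
{
	/* a group with shares=1024 whose load is split 300/700 across 2 cpus */
	uint64_t total = 1000;

	printf("cpu0: %llu\n",
	       (unsigned long long)group_entity_contrib(300, 1024, total));
	printf("cpu1: %llu\n",
	       (unsigned long long)group_entity_contrib(700, 1024, total));
	return 0;
}

The two per-cpu entities come out at roughly 306 and 716, i.e. together about tg->shares (1024), so the hierarchy as a whole still competes like a single entity of that weight.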
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.855074415@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index db788222f198..e20cb2693ef7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1117,22 +1117,43 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, cfs_rq->tg_load_contrib += tg_contrib; } } + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div64_u64(contrib, + atomic64_read(&tg->load_avg) + 1); +} #else static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} #endif +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); +} + /* Compute the current contribution to load_avg by se, return any delta */ static long __update_entity_load_avg_contrib(struct sched_entity *se) { long old_contrib = se->avg.load_avg_contrib; - if (!entity_is_task(se)) - return 0; - - se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum * - se->load.weight, - se->avg.runnable_avg_period + 1); + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_group_entity_contrib(se); + } return se->avg.load_avg_contrib - old_contrib; } -- cgit v1.2.3 From bb17f65571e97a7ec0297571fb1154fbd107ad00 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: sched: Normalize tg load contributions against runnable time Entities of equal weight should receive equitable distribution of cpu time. This is challenging in the case of a task_group's shares as execution may be occurring on multiple cpus simultaneously. To handle this we divide up the shares into weights proportionate with the load on each cfs_rq. This does not however, account for the fact that the sum of the parts may be less than one cpu and so we need to normalize: load(tg) = min(runnable_avg(tg), 1) * tg->shares Where runnable_avg is the aggregate time in which the task_group had runnable children. Signed-off-by: Paul Turner Reviewed-by: Ben Segall . 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++++ kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 290892361a09..71b0ea325e93 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) atomic64_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); #endif print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e20cb2693ef7..9e49722da032 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, } } +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib; + + /* The fraction of a cpu used by this cfs_rq */ + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + } +} + static inline void __update_group_entity_contrib(struct sched_entity *se) { struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg = cfs_rq->tg; + int runnable_avg; + u64 contrib; contrib = cfs_rq->tg_load_contrib * tg->shares; se->avg.load_avg_contrib = div64_u64(contrib, atomic64_read(&tg->load_avg) + 1); + + /* + * For group entities we need to compute a correction term in the case + * that they are consuming <1 cpu so that we would contribute the same + * load as a task of equal weight. + * + * Explicitly co-ordinating this measurement would be expensive, but + * fortunately the sum of each cpus contribution forms a usable + * lower-bound on the true value. + * + * Consider the aggregate of 2 contributions. Either they are disjoint + * (and the sum represents true value) or they are disjoint and we are + * understating by the aggregate of their overlap. + * + * Extending this to N cpus, for a given overlap, the maximum amount we + * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of + * cpus that overlap for this interval and w_i is the interval width. + * + * On a small machine; the first term is well-bounded which bounds the + * total error since w_i is a subset of the period. Whereas on a + * larger machine, while this first term can be larger, if w_i is the + * of consequential size guaranteed to see n_i*w_i quickly converge to + * our upper bound of 1-cpu. 
+ */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < NICE_0_LOAD) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib >>= NICE_0_SHIFT; + } } #else static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} #endif @@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) if (entity_is_task(se)) { __update_task_entity_contrib(se); } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); __update_group_entity_contrib(se); } @@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); } /* Add the load generated by se into cfs_rq's child load-average */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 924a99094888..134928dc6f05 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -113,6 +113,7 @@ struct task_group { atomic_t load_weight; atomic64_t load_avg; + atomic_t runnable_avg; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -234,6 +235,7 @@ struct cfs_rq { atomic64_t decay_counter, removed_load; u64 last_decay; #ifdef CONFIG_FAIR_GROUP_SCHED + u32 tg_runnable_contrib; u64 tg_load_contrib; #endif #endif -- cgit v1.2.3 From f1b17280efbd21873d1db8631117bdbccbcb39a2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: sched: Maintain runnable averages across throttled periods With bandwidth control tracked entities may cease execution according to user specified bandwidth limits. Charging this time as either throttled or blocked however, is incorrect and would falsely skew in either direction. What we actually want is for any throttled periods to be "invisible" to load-tracking as they are removed from the system for that interval and contribute normally otherwise. Do this by moderating the progression of time to omit any periods in which the entity belonged to a throttled hierarchy. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141506.998912151@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 3 ++- 2 files changed, 42 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e49722da032..873c9f5c5796 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1222,15 +1222,26 @@ static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, cfs_rq->blocked_load_avg = 0; } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + /* Update a sched_entity's runnable average */ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta; + u64 now; - if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg, - se->on_rq)) + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. 
+ */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) return; contrib_delta = __update_entity_load_avg_contrib(se); @@ -1250,7 +1261,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, */ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) { - u64 now = rq_of(cfs_rq)->clock_task >> 20; + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; u64 decays; decays = now - cfs_rq->last_decay; @@ -1841,6 +1852,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return &tg->cfs_bandwidth; } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1991,6 +2011,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->load_stamp += delta; cfs_rq->load_last += delta; + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; + /* update entity weight now that we are on_rq again */ update_cfs_shares(cfs_rq); } @@ -2005,8 +2029,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; /* group is entering throttled state, record last load */ - if (!cfs_rq->throttle_count) + if (!cfs_rq->throttle_count) { update_cfs_load(cfs_rq, 0); + cfs_rq->throttled_clock_task = rq->clock_task; + } cfs_rq->throttle_count++; return 0; @@ -2021,7 +2047,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* account load preceding throttle */ + /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); @@ -2045,7 +2071,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_timestamp = rq->clock; + cfs_rq->throttled_clock = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -2063,10 +2089,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 0; raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - cfs_rq->throttled_timestamp = 0; update_rq_clock(rq); /* update hierarchical throttle state */ @@ -2466,8 +2491,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} diff --git a/kernel/sched/sched.h 
b/kernel/sched/sched.h index 134928dc6f05..d13bce7a44ef 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -281,7 +281,8 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; - u64 throttled_timestamp; + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ -- cgit v1.2.3 From 82958366cfea1a50e7e90907b2d55ae29ed69974 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: sched: Replace update_shares weight distribution with per-entity computation Now that the machinery in place is in place to compute contributed load in a bottom up fashion; replace the shares distribution code within update_shares() accordingly. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.061208672@google.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 --- kernel/sched/fair.c | 157 ++++++++------------------------------------------- kernel/sched/sched.h | 36 ++++-------- 3 files changed, 36 insertions(+), 165 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 71b0ea325e93..2cd3c1b4e582 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -218,14 +218,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", - SPLIT_NS(cfs_rq->load_avg)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", - SPLIT_NS(cfs_rq->load_period)); - SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", - cfs_rq->load_contribution); - SEQ_printf(m, " .%-30s: %d\n", "load_tg", - atomic_read(&cfs_rq->tg->load_weight)); SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 873c9f5c5796..57fae95eed99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -658,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
@@ -680,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED - cfs_rq->load_unacc_exec_time += delta_exec; -#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -806,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, - int global_update) -{ - struct task_group *tg = cfs_rq->tg; - long load_avg; - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - } -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ - u64 period = sysctl_sched_shares_window; - u64 now, delta; - unsigned long load = cfs_rq->load.weight; - - if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) - return; - - now = rq_of(cfs_rq)->clock_task; - delta = now - cfs_rq->load_stamp; - - /* truncate load history at 4 idle periods */ - if (cfs_rq->load_stamp > cfs_rq->load_last && - now - cfs_rq->load_last > 4 * period) { - cfs_rq->load_period = 0; - cfs_rq->load_avg = 0; - delta = period - 1; - } - - cfs_rq->load_stamp = now; - cfs_rq->load_unacc_exec_time = 0; - cfs_rq->load_period += delta; - if (load) { - cfs_rq->load_last = now; - cfs_rq->load_avg += delta * load; - } - - /* consider updating load contribution on each fold or truncate */ - if (global_update || cfs_rq->load_period > period - || !cfs_rq->load_period) - update_cfs_rq_load_contribution(cfs_rq, global_update); - - while (cfs_rq->load_period > period) { - /* - * Inline assembly required to prevent the compiler - * optimising this loop into a divmod call. - * See __iter_div_u64_rem() for another example of this. - */ - asm("" : "+rm" (cfs_rq->load_period)); - cfs_rq->load_period /= 2; - cfs_rq->load_avg /= 2; - } - - if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) - list_del_leaf_cfs_rq(cfs_rq); -} - static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) { long tg_weight; @@ -881,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). 
*/ - tg_weight = atomic_read(&tg->load_weight); - tg_weight -= cfs_rq->load_contribution; + tg_weight = atomic64_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -906,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return shares; } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); - } -} # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} # endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) @@ -944,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; @@ -963,17 +877,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} - static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -} #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -1490,7 +1396,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -1587,7 +1492,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); /* @@ -1756,11 +1660,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_entity_load_avg(curr, 1); update_cfs_rq_blocked_load(cfs_rq, 1); - /* - * Update share accounting for long-running entities. 
- */ - update_entity_shares_tick(cfs_rq); - #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -2005,18 +1904,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { - u64 delta = rq->clock_task - cfs_rq->load_stamp; - - /* leaving throttled state, advance shares averaging windows */ - cfs_rq->load_stamp += delta; - cfs_rq->load_last += delta; - /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq->clock_task - cfs_rq->throttled_clock_task; - - /* update entity weight now that we are on_rq again */ - update_cfs_shares(cfs_rq); } #endif @@ -2028,11 +1918,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, record last load */ - if (!cfs_rq->throttle_count) { - update_cfs_load(cfs_rq, 0); + /* group is entering throttled state, stop time */ + if (!cfs_rq->throttle_count) cfs_rq->throttled_clock_task = rq->clock_task; - } cfs_rq->throttle_count++; return 0; @@ -2630,7 +2518,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); } @@ -2692,7 +2579,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); } @@ -3755,27 +3641,36 @@ next: */ static int update_shares_cpu(struct task_group *tg, int cpu) { + struct sched_entity *se; struct cfs_rq *cfs_rq; unsigned long flags; struct rq *rq; - if (!tg->se[cpu]) - return 0; - rq = cpu_rq(cpu); + se = tg->se[cpu]; cfs_rq = tg->cfs_rq[cpu]; raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); update_cfs_rq_blocked_load(cfs_rq, 1); - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); + if (se) { + update_entity_load_avg(se, 1); + /* + * We pivot on our runnable average having decayed to zero for + * list removal. This generally implies that all our children + * have also been removed (modulo rounding error or bandwidth + * control); however, such cases are rare and we can fix these + * at enqueue. + * + * TODO: fix up out-of-order children on enqueue. 
+ */ + if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) + list_del_leaf_cfs_rq(cfs_rq); + } else { + update_rq_runnable_avg(rq, rq->nr_running); + } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5702,10 +5597,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->tg = tg; cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d13bce7a44ef..0a75a430ca77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -234,11 +234,21 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; + #ifdef CONFIG_FAIR_GROUP_SCHED u32 tg_runnable_contrib; u64 tg_load_contrib; -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -254,28 +264,6 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ -#ifdef CONFIG_SMP - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long h_load; - - /* - * Maintaining per-cpu shares distribution for group scheduling - * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time - */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; - - unsigned long load_contribution; -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; -- cgit v1.2.3 From 48a1675323fa1b7844e479ad2a4469f4558c0f79 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: sched: Refactor update_shares_cpu() -> update_blocked_avgs() Now that running entities maintain their own load-averages the work we must do in update_shares() is largely restricted to the periodic decay of blocked entities. This allows us to be a little less pessimistic regarding our occupancy on rq->lock and the associated rq->clock updates required. 
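The shape of that change, as a userspace toy (hypothetical names and data, not the kernel functions): instead of one lock round-trip and clock update per task_group, a single critical section now covers the whole bottom-up walk of this cpu's leaf cfs_rqs.

#include <pthread.h>
#include <stdio.h>

#define NR_GROUPS 4

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static long blocked_load[NR_GROUPS] = { 800, 400, 200, 100 };

static void decay_one(int g)
{
	blocked_load[g] -= blocked_load[g] / 2;	/* stand-in for y^n decay */
}

/* old shape: lock/unlock (and a clock update) per group */
static void update_shares_old(void)
{
	for (int g = 0; g < NR_GROUPS; g++) {
		pthread_mutex_lock(&rq_lock);
		decay_one(g);
		pthread_mutex_unlock(&rq_lock);
	}
}

/* new shape: one critical section covers the whole leaf walk */
static void update_blocked_averages_new(void)
{
	pthread_mutex_lock(&rq_lock);
	for (int g = 0; g < NR_GROUPS; g++)
		decay_one(g);
	pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
	update_shares_old();
	update_blocked_averages_new();
	for (int g = 0; g < NR_GROUPS; g++)
		printf("group %d blocked load: %ld\n", g, blocked_load[g]);
	return 0;
}

The trade-off, noted in the patch itself, is a longer single hold time on rq->lock in exchange for fewer acquisitions and clock updates; the in-code comment leaves periodic lock dropping as a possible follow-up.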
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.133999170@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57fae95eed99..dcc27d8ae6ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3639,20 +3639,15 @@ next: /* * update tg->load_weight by folding this cpu's load_avg */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) { - struct sched_entity *se; - struct cfs_rq *cfs_rq; - unsigned long flags; - struct rq *rq; - - rq = cpu_rq(cpu); - se = tg->se[cpu]; - cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - raw_spin_lock_irqsave(&rq->lock, flags); + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; - update_rq_clock(rq); update_cfs_rq_blocked_load(cfs_rq, 1); if (se) { @@ -3669,32 +3664,33 @@ static int update_shares_cpu(struct task_group *tg, int cpu) if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) list_del_leaf_cfs_rq(cfs_rq); } else { + struct rq *rq = rq_of(cfs_rq); update_rq_runnable_avg(rq, rq->nr_running); } - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return 0; } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu) { - struct cfs_rq *cfs_rq; struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + unsigned long flags; - rcu_read_lock(); + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - update_shares_cpu(cfs_rq->tg, cpu); + /* + * Note: We may want to consider periodically releasing + * rq->lock about these updates so that creating many task + * groups does not result in continually extending hold time. + */ + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); } - rcu_read_unlock(); + + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -3746,7 +3742,7 @@ static unsigned long task_h_load(struct task_struct *p) return load; } #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu) { } @@ -4813,7 +4809,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); - update_shares(this_cpu); + update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; @@ -5068,7 +5064,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; - update_shares(cpu); + update_blocked_averages(cpu); rcu_read_lock(); for_each_domain(cpu, sd) { -- cgit v1.2.3 From f269ae0469fc882332bdfb5db15d3c1315fe2a10 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:31 +0200 Subject: sched: Update_cfs_shares at period edge Now that our measurement intervals are small (~1ms) we can amortize the posting of update_shares() to be about each period overflow. This is a large cost saving for frequently switching tasks. 
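A back-of-the-envelope model (assumed numbers, not measurements) of the amortization: share recomputation used to track the enqueue/dequeue rate, and is now bounded by the ~1ms period overflow per cfs_rq.

#include <stdio.h>

int main(void)
{
	double enq_deq_per_sec = 20000.0;   /* a busy, frequently switching cfs_rq */
	double period_ms = 1.0;             /* load-tracking period (~1ms) */

	double old_updates = enq_deq_per_sec;      /* roughly one per enqueue/dequeue */
	double new_updates = 1000.0 / period_ms;   /* at most one per period overflow */

	printf("update_cfs_shares() calls/sec: old ~%.0f, new ~%.0f (~%.0fx fewer)\n",
	       old_updates, new_updates, old_updates / new_updates);
	return 0;
}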
Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.200772172@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dcc27d8ae6ba..002a7697f437 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1187,6 +1187,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) } __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + update_cfs_shares(cfs_rq); } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1396,9 +1397,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1471,7 +1471,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1491,8 +1490,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - se->on_rq = 0; account_entity_dequeue(cfs_rq, se); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1506,7 +1505,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + se->on_rq = 0; } /* @@ -2518,8 +2517,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -2579,8 +2578,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); + update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -5639,8 +5638,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) + for_each_sched_entity(se) { update_cfs_shares(group_cfs_rq(se)); + /* update contribution to parent */ + update_entity_load_avg(se, 1); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From 5b51f2f80b3b906ce59bd4dce6eca3c7f34cb1b9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:32 +0200 Subject: sched: Make __update_entity_runnable_avg() fast __update_entity_runnable_avg forms the core of maintaining an entity's runnable load average. In this function we charge the accumulated run-time since last update and handle appropriate decay. In some cases, e.g. a waking task, this time interval may be much larger than our period unit. Fortunately we can exploit some properties of our series to perform decay for a blocked update in constant time and account the contribution for a running update in essentially-constant* time. 
[*]: For any running entity they should be performing updates at the tick which gives us a soft limit of 1 jiffy between updates, and we can compute up to a 32 jiffy update in a single pass. C program to generate the magic constants in the arrays: #include #include #define N 32 #define WMULT_SHIFT 32 const long WMULT_CONST = ((1UL << N) - 1); double y; long runnable_avg_yN_inv[N]; void calc_mult_inv() { int i; double yn = 0; printf("inverses\n"); for (i = 0; i < N; i++) { yn = (double)WMULT_CONST * pow(y, i); runnable_avg_yN_inv[i] = yn; printf("%2d: 0x%8lx\n", i, runnable_avg_yN_inv[i]); } printf("\n"); } long mult_inv(long c, int n) { return (c * runnable_avg_yN_inv[n]) >> WMULT_SHIFT; } void calc_yn_sum(int n) { int i; double sum = 0, sum_fl = 0, diff = 0; /* * We take the floored sum to ensure the sum of partial sums is never * larger than the actual sum. */ printf("sum y^n\n"); printf(" %8s %8s %8s\n", "exact", "floor", "error"); for (i = 1; i <= n; i++) { sum = (y * sum + y * 1024); sum_fl = floor(y * sum_fl+ y * 1024); printf("%2d: %8.0f %8.0f %8.0f\n", i, sum, sum_fl, sum_fl - sum); } printf("\n"); } void calc_conv(long n) { long old_n; int i = -1; printf("convergence (LOAD_AVG_MAX, LOAD_AVG_MAX_N)\n"); do { old_n = n; n = mult_inv(n, 1) + 1024; i++; } while (n != old_n); printf("%d> %ld\n", i - 1, n); printf("\n"); } void main() { y = pow(0.5, 1/(double)N); calc_mult_inv(); calc_conv(1024); calc_yn_sum(N); } [ Compile with -lm ] Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.277808946@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 125 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 002a7697f437..6ecf455fd95b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -883,18 +883,93 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent + * over-estimates when re-combining. 
+ */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ static __always_inline u64 decay_load(u64 val, u64 n) { - for (; n && val; n--) { - val *= 4008; - val >>= 12; + unsigned int local_n; + + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; } - return val; + val *= runnable_avg_yN_inv[local_n]; + /* We don't use SRR here since we always want to round down. */ + return val >> 32; +} + +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n = LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; } /* @@ -929,7 +1004,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, int runnable) { - u64 delta; + u64 delta, periods; + u32 runnable_contrib; int delta_w, decayed = 0; delta = now - sa->last_runnable_update; @@ -963,25 +1039,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, * period and accrue it. */ delta_w = 1024 - delta_w; - BUG_ON(delta_w > delta); - do { - if (runnable) - sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; - - /* - * Remainder of delta initiates a new period, roll over - * the previous. - */ - sa->runnable_avg_sum = - decay_load(sa->runnable_avg_sum, 1); - sa->runnable_avg_period = - decay_load(sa->runnable_avg_period, 1); - - delta -= delta_w; - /* New period is empty */ - delta_w = 1024; - } while (delta >= 1024); + if (runnable) + sa->runnable_avg_sum += delta_w; + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + sa->runnable_avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ -- cgit v1.2.3 From f4e26b120b9de84cb627bc7361ba43cfdc51341f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Thu, 4 Oct 2012 13:18:32 +0200 Subject: sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking While per-entity load-tracking is generally useful, beyond computing shares distribution, e.g. runnable based load-balance (in progress), governors, power-management, etc. These facilities are not yet consumers of this data. 
This may be trivially reverted when the information is required; but avoid paying the overhead for calculations we will not use until then. Signed-off-by: Paul Turner Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120823141507.422162369@google.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 13 +++++++++++-- kernel/sched/sched.h | 9 ++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f26860074ef2..5dae0d252ff7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1526,7 +1526,12 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6ecf455fd95b..3e6a3531fa90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -882,7 +882,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. @@ -3173,6 +3174,12 @@ unlock: return new_cpu; } +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -3196,6 +3203,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } } +#endif #endif /* CONFIG_SMP */ static unsigned long @@ -5773,8 +5781,9 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, +#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, - +#endif .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0a75a430ca77..5eca173b563f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -225,6 +225,12 @@ struct cfs_rq { #endif #ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. 
@@ -234,7 +240,8 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; - +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED u32 tg_runnable_contrib; u64 tg_load_contrib; -- cgit v1.2.3 From e9c84cb8d5f1b1ea6fcbe6190d51dc84b6975938 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Jul 2012 13:53:26 +0200 Subject: sched: Describe CFS load-balancer Add some scribbles on how and why the load-balancer works.. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1341316406.23484.64.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e6a3531fa90..a319d56c7605 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3456,8 +3456,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp #ifdef CONFIG_SMP /************************************************** - * Fair scheduling class load-balancing methods: - */ + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + * W_i,n/P_i == W_j,n/P_j for all i,j (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + * W_i,0 = \Sum_j w_i,j (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. + * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) + * + * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * + * We them move tasks around to minimize the imbalance. In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + * - infeasible weights; + * - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + * log_2 n 1 n + * \Sum { --- * --- * 2^i } = O(n) (5) + * i = 0 2^i 2^i + * `- size of each group + * | | `- number of cpus doing load-balance + * | `- freq + * `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer. 
+ * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + * log_2 n + * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) + * k = 0 + * + * And you'll find that: + * + * A^(log_2 n)_i,j != 0 for all i,j (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + * O(nm log n), n := nr_cpus, m := nr_tasks (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + * s_k,i + * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) + * S_k + * + * Where + * + * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, its a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + * rewrite all of this once again.] + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; -- cgit v1.2.3 From 99fb4a122e96203dfd6c67d99d908aafd20f4753 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 20 Oct 2012 23:05:19 +0400 Subject: lockdep: Use KSYM_NAME_LEN'ed buffer for __get_key_name() Not a big deal, but since other __get_key_name() callers use it lets be consistent. 
Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20121020190519.GH25467@moon Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 91c32a0b612c..b2c71c5873e4 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v) static void print_name(struct seq_file *m, struct lock_class *class) { - char str[128]; + char str[KSYM_NAME_LEN]; const char *name = class->name; if (!name) { -- cgit v1.2.3 From 8ae763cd7e88a6bc552a6615ba6c1dcaa4828cbf Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 16 Oct 2012 11:53:44 +0100 Subject: audit: remove bogus tty name check tty name is an array not a pointer Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2f186ed80c40..fc7376bf86ea 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1159,7 +1159,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) cred = current_cred(); spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + if (tsk->signal && tsk->signal->tty) tty = tsk->signal->tty->name; else tty = "(none)"; -- cgit v1.2.3 From 0d13ac96b9c38e3e5434c93990e4bbf0939ab199 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 18 Jul 2012 17:51:26 +0800 Subject: uprobes: Fix misleading log entry There don't have any 'r' prefix in uprobe event naming, remove it. Signed-off-by: Jovi Zhang Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 03003cd7dd96..f3c3811f7e1c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -189,7 +189,7 @@ static int create_trace_uprobe(int argc, char **argv) if (argv[0][0] == '-') is_delete = true; else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); + pr_info("Probe definition must be started with 'p' or '-'.\n"); return -EINVAL; } -- cgit v1.2.3 From 5d8f72b55c275677865de670fa147ed318191d81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Oct 2012 19:46:06 +0200 Subject: freezer: change ptrace_stop/do_signal_stop to use freezable_schedule() try_to_freeze_tasks() and cgroup_freezer rely on scheduler locks to ensure that a task doing STOPPED/TRACED -> RUNNING transition can't escape freezing. This mostly works, but ptrace_stop() does not necessarily call schedule(), it can change task->state back to RUNNING and check freezing() without any lock/barrier in between. We could add the necessary barrier, but this patch changes ptrace_stop() and do_signal_stop() to use freezable_schedule(). This fixes the race, freezer_count() and freezer_should_skip() carefully avoid the race. And this simplifies the code, try_to_freeze_tasks/update_if_frozen no longer need to use task_is_stopped_or_traced() checks with the non trivial assumptions. We can rely on the mechanism which was specially designed to mark the sleeping task as "frozen enough". v2: As Tejun pointed out, we can also change get_signal_to_deliver() and move try_to_freeze() up before 'relock' label. 
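For reference, a from-memory sketch of the freezer helpers this change relies on (the authoritative definitions are in include/linux/freezer.h); this is why the open-coded try_to_freeze() calls and the task_is_stopped_or_traced() special cases in the diff below can go away:

/* Approximation of the helpers behind freezable_schedule(); see
 * include/linux/freezer.h for the real versions. */
static inline void freezer_do_not_count(void)
{
	current->flags |= PF_FREEZER_SKIP;	/* freezer may treat us as frozen */
}

static inline void freezer_count(void)
{
	current->flags &= ~PF_FREEZER_SKIP;
	try_to_freeze();			/* freeze now, before running on */
}

#define freezable_schedule()	\
({				\
	freezer_do_not_count();	\
	schedule();		\
	freezer_count();	\
})

With PF_FREEZER_SKIP set across the sleep, freezer_should_skip() lets try_to_freeze_tasks() and the cgroup freezer count a stopped/traced task as frozen, and freezer_count() refreezes the task on wakeup before it can run anything substantial.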
Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/cgroup_freezer.c | 3 +-- kernel/freezer.c | 11 ++--------- kernel/power/process.c | 13 +------------ kernel/signal.c | 20 ++++++-------------- 4 files changed, 10 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 8a92b0e52099..bedefd9a22df 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -198,8 +198,7 @@ static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) * completion. Consider it frozen in addition to * the usual frozen condition. */ - if (!frozen(task) && !task_is_stopped_or_traced(task) && - !freezer_should_skip(task)) + if (!frozen(task) && !freezer_should_skip(task)) goto notyet; } } diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) { + if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); - /* - * fake_signal_wake_up() goes through p's scheduler - * lock and guarantees that TASK_STOPPED/TRACED -> - * TASK_RUNNING transition can't race with task state - * testing in try_to_freeze_tasks(). - */ - } else { + else wake_up_state(p, TASK_INTERRUPTIBLE); - } spin_unlock_irqrestore(&freezer_lock, flags); return true; diff --git a/kernel/power/process.c b/kernel/power/process.c index 87da817f9e13..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - * - * Because freeze_task() goes through p's scheduler lock, it's - * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING - * transition can't race with task state testing here. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) + if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..5ffb5626e072 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); - schedule(); + freezable_schedule(); } else { /* * By the time we got the lock, our tracer went away. @@ -1929,13 +1929,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) read_unlock(&tasklist_lock); } - /* - * While in TASK_TRACED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with @@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - schedule(); + freezable_schedule(); return true; } else { /* @@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, if (unlikely(uprobe_deny_signal())) return 0; -relock: /* - * We'll jump back here after any time we were stopped in TASK_STOPPED. 
- * While in TASK_STOPPED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. + * Do this once, we can't return to user-mode if freezing() == T. + * do_signal_stop() and ptrace_stop() do freezable_schedule() and + * thus do not need another check after return. */ try_to_freeze(); +relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if -- cgit v1.2.3 From cda4dc813071e6cb04944c5a140610bd06acd295 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 13 Oct 2012 01:14:17 +0800 Subject: rcutorture: Use DEFINE_STATIC_SRCU() Use DEFINE_STATIC_SRCU() to simplify the rcutorture.c SRCU test code. Signed-off-by: Lai Jiangshan Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 41 ++++++----------------------------------- 1 file changed, 6 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a..f4019720ceca 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -339,7 +339,6 @@ rcu_stutter_wait(char *title) struct rcu_torture_ops { void (*init)(void); - void (*cleanup)(void); int (*readlock)(void); void (*read_delay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); @@ -431,7 +430,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops rcu_ops = { .init = NULL, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, @@ -475,7 +473,6 @@ static void rcu_sync_torture_init(void) static struct rcu_torture_ops rcu_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, @@ -493,7 +490,6 @@ static struct rcu_torture_ops rcu_sync_ops = { static struct rcu_torture_ops rcu_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, @@ -536,7 +532,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -553,7 +548,6 @@ static struct rcu_torture_ops rcu_bh_ops = { static struct rcu_torture_ops rcu_bh_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -570,7 +564,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { static struct rcu_torture_ops rcu_bh_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, @@ -589,19 +582,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { * Definitions for srcu torture testing. 
*/ -static struct srcu_struct srcu_ctl; - -static void srcu_torture_init(void) -{ - init_srcu_struct(&srcu_ctl); - rcu_sync_torture_init(); -} - -static void srcu_torture_cleanup(void) -{ - synchronize_srcu(&srcu_ctl); - cleanup_srcu_struct(&srcu_ctl); -} +DEFINE_STATIC_SRCU(srcu_ctl); static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) { @@ -672,8 +653,7 @@ static int srcu_torture_stats(char *page) } static struct rcu_torture_ops srcu_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -687,8 +667,7 @@ static struct rcu_torture_ops srcu_ops = { }; static struct rcu_torture_ops srcu_sync_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -712,8 +691,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) } static struct rcu_torture_ops srcu_raw_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock_raw, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, @@ -727,8 +705,7 @@ static struct rcu_torture_ops srcu_raw_ops = { }; static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock_raw, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, @@ -747,8 +724,7 @@ static void srcu_torture_synchronize_expedited(void) } static struct rcu_torture_ops srcu_expedited_ops = { - .init = srcu_torture_init, - .cleanup = srcu_torture_cleanup, + .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, @@ -783,7 +759,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -799,7 +774,6 @@ static struct rcu_torture_ops sched_ops = { static struct rcu_torture_ops sched_sync_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -814,7 +788,6 @@ static struct rcu_torture_ops sched_sync_ops = { static struct rcu_torture_ops sched_expedited_ops = { .init = rcu_sync_torture_init, - .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, @@ -1936,8 +1909,6 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! 
*/ - if (cur_ops->cleanup) - cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (n_online_successes != n_online_attempts || -- cgit v1.2.3 From 11113334d1c5dd5355c86e531c29f1202a855c86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Oct 2012 18:05:51 +0200 Subject: vtime: Make vtime_account_system() irqsafe vtime_account_system() currently has only one caller with vtime_account() which is irq safe. Now we are going to call it from other places like kvm where irqs are not always disabled by the time we account the cputime. So let's make it irqsafe. The arch implementation part is now prefixed with "__". vtime_account_idle() arch implementation is prefixed accordingly to stay consistent. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- kernel/sched/cputime.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a6..0359f47b0ae4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -433,10 +433,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *st = cputime.stime; } +void vtime_account_system(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + __vtime_account_system(tsk); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system); + /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that + * __vtime_account_system() and __vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -449,9 +459,9 @@ void vtime_account(struct task_struct *tsk) local_irq_save(flags); if (in_interrupt() || !is_idle_task(tsk)) - vtime_account_system(tsk); + __vtime_account_system(tsk); else - vtime_account_idle(tsk); + __vtime_account_idle(tsk); local_irq_restore(flags); } -- cgit v1.2.3 From fa5058f3b63153e0147ef65bcdb3a4ee63581346 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Oct 2012 04:07:19 +0200 Subject: cputime: Specialize irq vtime hooks With CONFIG_VIRT_CPU_ACCOUNTING, when vtime_account() is called in irq entry/exit, we perform a check on the context: if we are interrupting the idle task we account the pending cputime to idle, otherwise account to system time or its sub-areas: tsk->stime, hardirq time, softirq time, ... However this check for idle only concerns the hardirq entry and softirq entry: * Hardirq may directly interrupt the idle task, in which case we need to flush the pending CPU time to idle. * The idle task may be directly interrupted by a softirq if it calls local_bh_enable(). There is probably no such call in any idle task but we need to cover every case. Ksoftirqd is not concerned because the idle time is flushed on context switch and softirq in the end of hardirq have the idle time already flushed from the hardirq entry. In the other cases we always account to system/irq time: * On hardirq exit we account the time to hardirq time. * On softirq exit we account the time to softirq time. 
To optimize this and avoid the indirect call to vtime_account() and the checks it performs, specialize the vtime irq APIs and only perform the check on irq entry. Irq exit can directly call vtime_account_system(). CONFIG_IRQ_TIME_ACCOUNTING behaviour doesn't change and directly maps to its own vtime_account() implementation. One may want to take benefits from the new APIs to optimize irq time accounting as well in the future. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/softirq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index cc96bdc0c2c9..ed567babe789 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - vtime_account(current); + vtime_account_irq_enter(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -272,7 +272,7 @@ restart: lockdep_softirq_exit(); - vtime_account(current); + vtime_account_irq_exit(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -341,7 +341,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - vtime_account(current); + vtime_account_irq_exit(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) -- cgit v1.2.3 From 3e1df4f506836e6bea1ab61cf88c75c8b1840643 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 6 Oct 2012 05:23:22 +0200 Subject: cputime: Separate irqtime accounting from generic vtime vtime_account() doesn't have the same role in CONFIG_VIRT_CPU_ACCOUNTING and CONFIG_IRQ_TIME_ACCOUNTING. In the first case it handles time accounting in any context. In the second case it only handles irq time accounting. So when vtime_account() is called from outside vtime_account_irq_*() this call is pointless to CONFIG_IRQ_TIME_ACCOUNTING. To fix the confusion, change vtime_account() to irqtime_account_irq() in CONFIG_IRQ_TIME_ACCOUNTING. This way we ensure future account_vtime() calls won't waste useless cycles in the irqtime APIs. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/sched/cputime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0359f47b0ae4..8d859dae5bed 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. 
*/ -void vtime_account(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(irqtime_account_irq); static int irqtime_account_hi_update(void) { -- cgit v1.2.3 From 0d855354ea351bec6b222e9fea86a876cfafdcb6 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 26 Oct 2012 18:28:56 +0200 Subject: perf, powerpc: Fix hw breakpoints returning -ENOSPC I've been trying to get hardware breakpoints with perf to work on POWER7 but I'm getting the following: % perf record -e mem:0x10000000 true Error: sys_perf_event_open() syscall returned with 28 (No space left on device). /bin/dmesg may provide additional information. Fatal: No CONFIG_PERF_EVENTS=y kernel support configured? true: Terminated (FWIW adding -a and it works fine) Debugging it seems that __reserve_bp_slot() is returning ENOSPC because it thinks there are no free breakpoint slots on this CPU. I have a 2 CPUs, so perf userspace is doing two perf_event_open syscalls to add a counter to each CPU [1]. The first syscall succeeds but the second is failing. On this second syscall, fetch_bp_busy_slots() sets slots.pinned to be 1, despite there being no breakpoint on this CPU. This is because the call the task_bp_pinned, checks all CPUs, rather than just the current CPU. POWER7 only has one hardware breakpoint per CPU (ie. HBP_NUM=1), so we return ENOSPC. The following patch fixes this by checking the associated CPU for each breakpoint in task_bp_pinned. I'm not familiar with this code, so it's provided as a reference to the above issue. Signed-off-by: Michael Neuling Acked-by: Peter Zijlstra Cc: Michael Ellerman Cc: Jovi Zhang Cc: K Prasad Link: http://lkml.kernel.org/r/1351268936-2956-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9a7b487c6fe2..fe8a916507ed 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. 
*/ -static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) +static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { struct task_struct *tsk = bp->hw.bp_target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + if (iter->hw.bp_target == tsk && + find_slot_idx(iter) == type && + cpu == iter->cpu) count += hw_breakpoint_weight(iter); } @@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(bp, type); + slots->pinned += task_bp_pinned(cpu, bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(bp, type); + nr += task_bp_pinned(cpu, bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(bp, type); + old_count = task_bp_pinned(cpu, bp, type); old_idx = old_count - 1; idx = old_idx + weight; -- cgit v1.2.3 From 5258f386ea4e8454bc801fb443e8a4217da1947c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 28 Oct 2012 12:19:23 -0700 Subject: sched/autogroup: Fix crash on reboot when autogroup is disabled Due to these two commits: 8323f26ce342 sched: Fix race in task_group() 800d4d30c8f2 sched, autogroup: Stop going ahead if autogroup is disabled ... autogroup scheduling's dynamic knobs are wrecked. With both patches applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to 800d4d30 not allowing autogroup to move things, and 8323f26ce making that the only way to switch runqueues. Remove most of the (dysfunctional) knobs and turn the remaining sched_autogroup_enabled knob readonly. If the user fiddles with cgroups hereafter, once tasks are moved, autogroup won't mess with them again unless they call setsid(). No knobs, no glitz, nada, just a cute little thing folks can turn on if they don't want to muck about with cgroups and/or systemd. 
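Purely as an illustrative userspace sketch (not part of the patch) of the one remaining trigger mentioned above: a task that starts a new session via setsid() (after a fork(), since a process group leader cannot call it) is attached to a fresh autogroup by sched_autogroup_create_attach() whenever sched_autogroup_enabled is set:

	#include <stdio.h>
	#include <sys/types.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {
			/* the child is not a group leader, so setsid() succeeds;
			 * with autogrouping on, the new session gets its own
			 * autogroup via sched_autogroup_create_attach() */
			if (setsid() == (pid_t)-1)
				perror("setsid");
			pause();	/* park so the task can be inspected */
		}
		return 0;
	}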
Signed-off-by: Mike Galbraith Cc: Xiaotian Feng Cc: Peter Zijlstra Cc: Xiaotian Feng Cc: Linus Torvalds Cc: Andrew Morton Cc: Oleg Nesterov Cc: # v3.6 Link: http://lkml.kernel.org/r/1351451963.4999.8.camel@maggy.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 68 ++++++++--------------------------------------- kernel/sched/auto_group.h | 9 +------ kernel/sysctl.c | 6 ++--- 3 files changed, 14 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..0f1bacb005a4 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -110,6 +110,9 @@ out_fail: bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { + if (!sysctl_sched_autogroup_enabled) + return false; + if (tg != &root_task_group) return false; @@ -143,15 +146,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - t = p; do { sched_move_task(t); } while_each_thread(p, t); -out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -159,8 +158,11 @@ out: /* Allocates GFP_KERNEL, cannot be called under any spinlock */ void sched_autogroup_create_attach(struct task_struct *p) { - struct autogroup *ag = autogroup_create(); + struct autogroup *ag; + if (!sysctl_sched_autogroup_enabled) + return; + ag = autogroup_create(); autogroup_move_group(p, ag); /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); @@ -176,11 +178,15 @@ EXPORT_SYMBOL(sched_autogroup_detach); void sched_autogroup_fork(struct signal_struct *sig) { + if (!sysctl_sched_autogroup_enabled) + return; sig->autogroup = autogroup_task_get(current); } void sched_autogroup_exit(struct signal_struct *sig) { + if (!sysctl_sched_autogroup_enabled) + return; autogroup_kref_put(sig->autogroup); } @@ -193,58 +199,6 @@ static int __init setup_autogroup(char *str) __setup("noautogroup", setup_autogroup); -#ifdef CONFIG_PROC_FS - -int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) -{ - static unsigned long next = INITIAL_JIFFIES; - struct autogroup *ag; - int err; - - if (nice < -20 || nice > 19) - return -EINVAL; - - err = security_task_setnice(current, nice); - if (err) - return err; - - if (nice < 0 && !can_nice(current, nice)) - return -EPERM; - - /* this is a heavy operation taking global locks.. */ - if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) - return -EAGAIN; - - next = HZ / 10 + jiffies; - ag = autogroup_task_get(p); - - down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); - if (!err) - ag->nice = nice; - up_write(&ag->lock); - - autogroup_kref_put(ag); - - return err; -} - -void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) -{ - struct autogroup *ag = autogroup_task_get(p); - - if (!task_group_is_autogroup(ag->tg)) - goto out; - - down_read(&ag->lock); - seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); - up_read(&ag->lock); - -out: - autogroup_kref_put(ag); -} -#endif /* CONFIG_PROC_FS */ - #ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..4552c6bf79d2 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,11 +4,6 @@ #include struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. 
It just stands for the number of task - * could use this autogroup. - */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; @@ -29,9 +24,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) + if (task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; return tg; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81c7b1a1a307..2914d0f752cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -363,10 +363,8 @@ static struct ctl_table kern_table[] = { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .mode = 0444, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_CFS_BANDWIDTH -- cgit v1.2.3 From 59ef28b1f14899b10d6b2682c7057ca00a9a3f47 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 25 Oct 2012 10:49:25 +1030 Subject: module: fix out-by-one error in kallsyms Masaki found and patched a kallsyms issue: the last symbol in a module's symtab wasn't transferred. This is because we manually copy the zero'th entry (which is always empty) then copy the rest in a loop starting at 1, though from src[0]. His fix was minimal, I prefer to rewrite the loops in more standard form. There are two loops: one to get the size, and one to copy. Make these identical: always count entry 0 and any defined symbol in an allocated non-init section. This bug exists since the following commit was introduced. module: reduce symbol table for loaded modules (v2) commit: 4a4962263f07d14660849ec134ee42b63e95ea9a LKML: http://lkml.org/lkml/2012/10/24/27 Reported-by: Masaki Kimura Cc: stable@kernel.org --- kernel/module.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6085f5ef88ea..6e48c3a43599 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2293,12 +2293,17 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); + /* strtab always starts with a nul, so offset 0 is the empty string. */ + strtab_size = 1; + /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - strtab_size += strlen(&info->strtab[src->st_name]) + 1; + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } + } /* Append room for core symbols at end of core part. 
*/ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); @@ -2332,15 +2337,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *dst = *src; *s++ = 0; - for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) - continue; - - dst[ndst] = *src; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; + for (ndst = i = 0; i < mod->num_symtab; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } } mod->core_num_syms = ndst; } -- cgit v1.2.3 From bcd83ea6cbfee54e33d1527b87538dc99ca2137b Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Wed, 26 Sep 2012 22:08:38 +0200 Subject: tracing: Replace strict_strto* with kstrto* * remove old string conversions with kstrto* Link: http://lkml.kernel.org/r/20120926200838.GC1244@0x90.at Signed-off-by: Daniel Walter Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 14 +++++++------- kernel/trace/trace_uprobe.c | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9dcf15d38380..60ad606dc85f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4381,7 +4381,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, if (strlen(tmp) == 0) return 1; - ret = strict_strtol(tmp, 10, &val); + ret = kstrtol(tmp, 10, &val); if (ret < 0) return ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 31e4f55773f1..f6928edacd6d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -431,7 +431,7 @@ static int __init set_tracing_thresh(char *str) if (!str) return 0; - ret = strict_strtoul(str, 0, &threshold); + ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; tracing_thresh = threshold * 1000; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c154797a7ff7..e5b0ca8b8d4d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps, } } else { if (field->is_signed) - ret = strict_strtoll(pred->regex.pattern, 0, &val); + ret = kstrtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->regex.pattern, 0, &val); + ret = kstrtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 507a7a9630bf..618dcf8bdb87 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -366,7 +366,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, * We use the callback data field (which is a pointer) * as our counter. 
*/ - ret = strict_strtoul(number, 0, (unsigned long *)&count); + ret = kstrtoul(number, 0, (unsigned long *)&count); if (ret) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1a2117043bb1..5a3c533ef060 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); + ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daa9980153af..412e959709b4 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type) goto fail; type++; - if (strict_strtoul(type, 0, &bs)) + if (kstrtoul(type, 0, &bs)) goto fail; switch (bs) { @@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) tmp = strchr(symbol, '+'); if (tmp) { - /* skip sign because strict_strtol doesn't accept '+' */ - ret = strict_strtoul(tmp + 1, 0, offset); + /* skip sign because kstrtoul doesn't accept '+' */ + ret = kstrtoul(tmp + 1, 0, offset); if (ret) return ret; @@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, else ret = -EINVAL; } else if (isdigit(arg[5])) { - ret = strict_strtoul(arg + 5, 10, ¶m); + ret = kstrtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { @@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, case '@': /* memory or symbol */ if (isdigit(arg[1])) { - ret = strict_strtoul(arg + 1, 0, ¶m); + ret = kstrtoul(arg + 1, 0, ¶m); if (ret) break; @@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, break; case '+': /* deref memory */ - arg++; /* Skip '+', because strict_strtol() rejects it. */ + arg++; /* Skip '+', because kstrtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg, 0, &offset); + ret = kstrtol(arg, 0, &offset); if (ret) break; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 03003cd7dd96..4ff9ca4f359a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -252,7 +252,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = strict_strtoul(arg, 0, &offset); + ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; -- cgit v1.2.3 From 6f4156723c084bfc0c0f72205c541fafb8ad3ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Oct 2012 12:13:07 -0400 Subject: tracing: Allow tracers to start at core initcall There's times during debugging that it is helpful to see traces of early boot functions. But the tracers are initialized at device_initcall() which is quite late during the boot process. Setting the kernel command line parameter ftrace=function will not show anything until the function tracer is initialized. This prevents being able to trace functions before device_initcall(). There's no reason that the tracers need to be initialized so late in the boot process. Move them up to core_initcall() as they still need to come after early_initcall() which initializes the tracing buffers. 
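As a minimal illustration of the ordering this relies on (the tracer name here is hypothetical; the real conversions are in the diff below), moving a tracer earlier is just a matter of which initcall macro wraps its init function, since early_initcall() runs before core_initcall(), which runs before device_initcall():

	__init static int init_my_tracer(void)
	{
		return register_tracer(&my_tracer);
	}
	core_initcall(init_my_tracer);	/* was: device_initcall(init_my_tracer); */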
Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_functions.c | 3 +-- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 60ad606dc85f..4451aa3a55a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2868,7 +2868,7 @@ static int __init ftrace_mod_cmd_init(void) { return register_ftrace_command(&ftrace_mod_cmd); } -device_initcall(ftrace_mod_cmd_init); +core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -4055,7 +4055,7 @@ static int __init ftrace_nodyn_init(void) ftrace_enabled = 1; return 0; } -device_initcall(ftrace_nodyn_init); +core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5f..bd3e0eef4eaa 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -199,7 +199,7 @@ __init static int init_branch_tracer(void) } return register_tracer(&branch_trace); } -device_initcall(init_branch_tracer); +core_initcall(init_branch_tracer); #else static inline diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 618dcf8bdb87..bb227e380cb5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -411,5 +411,4 @@ static __init int init_function_trace(void) init_func_cmd_traceon(); return register_tracer(&function_trace); } -device_initcall(init_function_trace); - +core_initcall(init_function_trace); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 99b4378393d5..a84b55879bc4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1474,4 +1474,4 @@ static __init int init_graph_trace(void) return register_tracer(&graph_trace); } -device_initcall(init_graph_trace); +core_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d98ee8283b29..11edebda4548 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -698,4 +698,4 @@ __init static int init_irqsoff_tracer(void) return 0; } -device_initcall(init_irqsoff_tracer); +core_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 02170c00c413..2f6af7833694 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -637,4 +637,4 @@ __init static int init_wakeup_tracer(void) return 0; } -device_initcall(init_wakeup_tracer); +core_initcall(init_wakeup_tracer); -- cgit v1.2.3 From f43c738bfa8608424610e4fc1aef4d4644e2ce11 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Tue, 2 Oct 2012 17:27:10 +0900 Subject: tracing: Change tracer's integer flags to bool print_max and use_max_tr in struct tracer are "int" variables and used like flags. This is wasteful, so change the type to "bool". 
Link: http://lkml.kernel.org/r/20121002082710.9807.86393.stgit@falsita Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_irqsoff.c | 12 ++++++------ kernel/trace/trace_sched_wakeup.c | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c15f528c1af4..c56a233c006e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -285,8 +285,8 @@ struct tracer { int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; struct tracer_flags *flags; - int print_max; - int use_max_tr; + bool print_max; + bool use_max_tr; }; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 11edebda4548..5ffce7b0f33c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -604,7 +604,7 @@ static struct tracer irqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, @@ -614,7 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -637,7 +637,7 @@ static struct tracer preemptoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, @@ -647,7 +647,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -672,7 +672,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, @@ -682,7 +682,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2f6af7833694..bc64fc137554 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -589,7 +589,7 @@ static struct tracer wakeup_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, .flags = &tracer_flags, @@ -599,7 +599,7 @@ static struct tracer wakeup_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -610,7 +610,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, - .print_max = 1, + .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, .flags = &tracer_flags, @@ 
-620,7 +620,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, - .use_max_tr = 1, + .use_max_tr = true, }; __init static int init_wakeup_tracer(void) -- cgit v1.2.3 From 884bfe89a462fcc85c8abd96171519cf2fe70929 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Fri, 15 Jul 2011 14:23:58 -0700 Subject: ring-buffer: Add a 'dropped events' counter The existing 'overrun' counter is incremented when the ring buffer wraps around, with overflow on (the default). We wanted a way to count requests lost from the buffer filling up with overflow off, too. I decided to add a new counter instead of retro-fitting the existing one because it seems like a different statistic to count conceptually, and also because of how the code was structured. Link: http://lkml.kernel.org/r/1310765038-26399-1-git-send-email-slavapestov@google.com Signed-off-by: Slava Pestov Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++++++++++++++------ kernel/trace/trace.c | 3 +++ 2 files changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b979426d16c6..0ebeb1d76ddf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -460,9 +460,10 @@ struct ring_buffer_per_cpu { unsigned long lost_events; unsigned long last_overrun; local_t entries_bytes; - local_t commit_overrun; - local_t overrun; local_t entries; + local_t overrun; + local_t commit_overrun; + local_t dropped_events; local_t committing; local_t commits; unsigned long read; @@ -2155,8 +2156,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are not in overwrite mode, * this is easy, just stop here. */ - if (!(buffer->flags & RB_FL_OVERWRITE)) + if (!(buffer->flags & RB_FL_OVERWRITE)) { + local_inc(&cpu_buffer->dropped_events); goto out_reset; + } ret = rb_handle_head_page(cpu_buffer, tail_page, @@ -2995,7 +2998,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** - * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * ring_buffer_overrun_cpu - get the number of overruns caused by the ring + * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ @@ -3015,7 +3019,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** - * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by + * commits failing due to the buffer wrapping around while there are uncommitted + * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ @@ -3035,6 +3041,28 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); +/** + * ring_buffer_dropped_events_cpu - get the number of dropped events caused by + * the ring buffer filling up (only if RB_FL_OVERWRITE is off). 
+ * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->dropped_events); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer @@ -3864,9 +3892,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6928edacd6d..36c213fbfce7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4385,6 +4385,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf, usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); + trace_seq_printf(s, "dropped events: %ld\n", cnt); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3 From b382ede6b5eb8188926b72a9ef42fd2354342a97 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Oct 2012 21:44:34 -0400 Subject: tracing: Expand ring buffer when trace_printk() is used Since tracing is not used by 99% of Linux users, even though tracing may be configured in, it does not make sense to allocate 1.4 Megs per CPU for the ring buffers if they are not used. Thus, on boot up the ring buffers are set to a minimal size until something needs the and they are expanded. This works well for events and tracers (function, etc), but for the asynchronous use of trace_printk() which can write to the ring buffer at any time, does not expand the buffers. On boot up a check is made to see if any trace_printk() is used to see if the trace_printk() temp buffer pages should be allocated. This same code can be used to expand the buffers as well. 
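As a hypothetical sketch of the kind of trace_printk() user this is about (names are illustrative): the presence of a trace_printk() call site, whether built in or loaded as a module, is what triggers trace_printk_init_buffers(), which with this patch now also expands the ring buffer:

	#include <linux/init.h>
	#include <linux/jiffies.h>
	#include <linux/kernel.h>
	#include <linux/module.h>

	static int __init demo_init(void)
	{
		/* any trace_printk() call site causes the printk buffers to be
		 * allocated and, after this patch, the ring buffer to be expanded */
		trace_printk("demo loaded, jiffies=%lu\n", jiffies);
		return 0;
	}
	module_init(demo_init);
	MODULE_LICENSE("GPL");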
Suggested-by: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 36c213fbfce7..a5411b7414b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1571,6 +1571,9 @@ void trace_printk_init_buffers(void) pr_info("ftrace: Allocated trace_printk buffers\n"); + /* Expand the buffers to set size */ + tracing_update_buffers(); + buffers_allocated = 1; } @@ -3030,6 +3033,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) */ ring_buffer_expanded = 1; + /* May be called before buffers are initialized */ + if (!global_trace.buffer) + return 0; + ret = ring_buffer_resize(global_trace.buffer, size, cpu); if (ret < 0) return ret; -- cgit v1.2.3 From 81698831bc462ff16f76bc11249a1e492424da4c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Oct 2012 10:15:05 -0400 Subject: tracing: Enable comm recording if trace_printk() is used If comm recording is not enabled when trace_printk() is used then you just get this type of output: [ adding trace_printk("hello! %d", irq); in do_IRQ ] <...>-2843 [001] d.h. 80.812300: do_IRQ: hello! 14 <...>-2734 [002] d.h2 80.824664: do_IRQ: hello! 14 <...>-2713 [003] d.h. 80.829971: do_IRQ: hello! 14 <...>-2814 [000] d.h. 80.833026: do_IRQ: hello! 14 By enabling the comm recorder when trace_printk is enabled: hackbench-6715 [001] d.h. 193.233776: do_IRQ: hello! 21 sshd-2659 [001] d.h. 193.665862: do_IRQ: hello! 21 -0 [001] d.h1 193.665996: do_IRQ: hello! 21 Suggested-by: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 3 +++ 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a5411b7414b1..b90a827a4641 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1559,10 +1559,10 @@ static int alloc_percpu_trace_buffer(void) return -ENOMEM; } +static int buffers_allocated; + void trace_printk_init_buffers(void) { - static int buffers_allocated; - if (buffers_allocated) return; @@ -1575,6 +1575,34 @@ void trace_printk_init_buffers(void) tracing_update_buffers(); buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. If the global_trace.buffer is already + * allocated here, then this was called by module code. 
+ */ + if (global_trace.buffer) + tracing_start_cmdline_record(); +} + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +static void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); } /** @@ -2797,6 +2825,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_OVERWRITE) ring_buffer_change_overwrite(global_trace.buffer, enabled); + + if (mask == TRACE_ITER_PRINTK) + trace_printk_start_stop_comm(enabled); } static ssize_t @@ -5099,6 +5130,7 @@ __init static int tracer_alloc_buffers(void) /* Only allocate trace_printk buffers if a trace_printk exists */ if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c56a233c006e..7824a55bd3fc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -841,6 +841,7 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); +void trace_printk_start_comm(void); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d608d09d08c0..dec47e70e254 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1489,6 +1489,9 @@ static __init int event_trace_enable(void) if (ret) pr_warn("Failed to enable trace event: %s\n", token); } + + trace_printk_start_comm(); + return 0; } -- cgit v1.2.3 From 2b70e59043f5a5ec083ea50cd2640aa49c64c675 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Oct 2012 11:14:14 -0400 Subject: tracing: Have tracing_sched_wakeup_trace() use standard unlock_commit The functon tracing_sched_wakeup_trace() does an open coded unlock commit and save stack. This is what the trace_nowake_buffer_unlock_commit() is for. Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7e62c0a18456..b0a136ac382a 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr->buffer, flags, 6, pc); - ftrace_trace_userstack(tr->buffer, flags, pc); + trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); } static void -- cgit v1.2.3 From 7ffbd48d5cab22bcd1120eb2349db1319e2d827a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Oct 2012 12:14:25 -0400 Subject: tracing: Cache comms only after an event occurred Whenever an event is registered, the comm of tasks are saved at every task switch instead of saving them at every event. But if an event isn't executed much, the comm cache will be filled up by tasks that did not record the event and you lose out on the comms that did. 
Here's an example, if you enable the following events: echo 1 > /debug/tracing/events/kvm/kvm_cr/enable echo 1 > /debug/tracing/events/net/net_dev_xmit/enable Note, there's no kvm running on this machine so the first event will never be triggered, but because it is enabled, the storing of comms will continue. If we now disable the network event: echo 0 > /debug/tracing/events/net/net_dev_xmit/enable and look at the trace: cat /debug/tracing/trace sshd-2672 [001] ..s2 375.731616: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s1 375.731617: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s2 375.859356: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s1 375.859357: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s2 375.947351: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s1 375.947352: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s2 376.035383: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s1 376.035383: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 sshd-2672 [001] ..s2 377.563806: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=226 rc=0 sshd-2672 [001] ..s1 377.563807: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=226 rc=0 sshd-2672 [001] ..s2 377.563834: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6be0 len=114 rc=0 sshd-2672 [001] ..s1 377.563842: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6be0 len=114 rc=0 We see that process 2672 which triggered the events has the comm "sshd". But if we run hackbench for a bit and look again: cat /debug/tracing/trace <...>-2672 [001] ..s2 375.731616: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s1 375.731617: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s2 375.859356: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s1 375.859357: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s2 375.947351: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s1 375.947352: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s2 376.035383: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s1 376.035383: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=242 rc=0 <...>-2672 [001] ..s2 377.563806: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6de0 len=226 rc=0 <...>-2672 [001] ..s1 377.563807: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6de0 len=226 rc=0 <...>-2672 [001] ..s2 377.563834: net_dev_xmit: dev=eth0 skbaddr=ffff88005cbb6be0 len=114 rc=0 <...>-2672 [001] ..s1 377.563842: net_dev_xmit: dev=br0 skbaddr=ffff88005cbb6be0 len=114 rc=0 The stored "sshd" comm has been flushed out and we get a useless "<...>". But by only storing comms after a trace event occurred, we can run hackbench all day and still get the same output. 
Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 35 +++++++++++++++++++++++++++-------- kernel/trace/trace.h | 3 +++ kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_functions_graph.c | 4 ++-- 4 files changed, 33 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b90a827a4641..88111b08b2c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -77,6 +77,13 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) return 0; } +/* + * To prevent the comm cache from being overwritten when no + * tracing is active, only save the comm when a trace event + * occurred. + */ +static DEFINE_PER_CPU(bool, trace_cmdline_save); + /* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization @@ -1135,6 +1142,11 @@ void tracing_record_cmdline(struct task_struct *tsk) !tracing_is_on()) return; + if (!__this_cpu_read(trace_cmdline_save)) + return; + + __this_cpu_write(trace_cmdline_save, false); + trace_save_cmdline(tsk); } @@ -1178,13 +1190,20 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + ring_buffer_unlock_commit(buffer, event); +} + static inline void __trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, int wake) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); ftrace_trace_userstack(buffer, flags, pc); @@ -1232,7 +1251,7 @@ void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, unsigned long flags, int pc, struct pt_regs *regs) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); @@ -1269,7 +1288,7 @@ trace_function(struct trace_array *tr, entry->parent_ip = parent_ip; if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); } void @@ -1362,7 +1381,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, entry->size = trace.nr_entries; if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ @@ -1458,7 +1477,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -1653,7 +1672,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -1724,7 +1743,7 @@ int trace_array_vprintk(struct trace_array *tr, memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } out: 
@@ -3993,7 +4012,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else entry->buf[cnt] = '\0'; - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); written = cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7824a55bd3fc..839ae003a053 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -359,6 +359,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +void __buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + int trace_empty(struct trace_iterator *iter); void *trace_find_next_entry_inc(struct trace_iterator *iter); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index bd3e0eef4eaa..95e96842ed29 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->correct = val == expect; if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a84b55879bc4..4edb4b74eb7e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -223,7 +223,7 @@ int __trace_graph_entry(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->graph_ent = *trace; if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); return 1; } @@ -327,7 +327,7 @@ void __trace_graph_return(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ret = *trace; if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace) -- cgit v1.2.3 From 01e3e710a9265fb7092efd67243d7b6dd6e2548a Mon Sep 17 00:00:00 2001 From: David Sharp Date: Thu, 7 Jun 2012 16:46:24 -0700 Subject: tracing: Trivial cleanup Remove ftrace_format_syscall() declaration; it is neither defined nor used. Also update a comment and formatting. Link: http://lkml.kernel.org/r/1339112785-21806-1-git-send-email-vnagarnaik@google.com Signed-off-by: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0ebeb1d76ddf..23a384b92512 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1821,7 +1821,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) } /** - * ring_buffer_update_event - update event type and data + * rb_update_event - update event type and data * @event: the even to update * @type: the type of event * @length: the size of the event field in the ring buffer @@ -2723,8 +2723,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); * and not the length of the event which would hold the header. 
*/ int ring_buffer_write(struct ring_buffer *buffer, - unsigned long length, - void *data) + unsigned long length, + void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; -- cgit v1.2.3 From 6f86ab9fcaef122abb837819139eadac1a0ca966 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 7 Jun 2012 16:46:25 -0700 Subject: tracing: Cleanup unnecessary function declarations The functions defined in include/trace/syscalls.h are not used directly since struct ftrace_event_class was introduced. Remove them from the header file and rearrange the ftrace_event_class declarations in trace_syscalls.c. Link: http://lkml.kernel.org/r/1339112785-21806-2-git-send-email-vnagarnaik@google.com Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 61 ++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 2485a7d09b11..7609dd6714c2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -21,9 +21,6 @@ static int syscall_enter_register(struct ftrace_event_call *event, static int syscall_exit_register(struct ftrace_event_call *event, enum trace_reg type, void *data); -static int syscall_enter_define_fields(struct ftrace_event_call *call); -static int syscall_exit_define_fields(struct ftrace_event_call *call); - static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -32,30 +29,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, -}; - -struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, -}; - -struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, -}; - -struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), - .raw_init = init_syscall_trace, -}; - extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; @@ -432,7 +405,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int init_syscall_trace(struct ftrace_event_call *call) +static int init_syscall_trace(struct ftrace_event_call *call) { int id; int num; @@ -457,6 +430,30 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, +}; + unsigned long __init __weak 
arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; @@ -537,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -int perf_sysenter_enable(struct ftrace_event_call *call) +static int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -558,7 +555,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) return ret; } -void perf_sysenter_disable(struct ftrace_event_call *call) +static void perf_sysenter_disable(struct ftrace_event_call *call) { int num; @@ -615,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -int perf_sysexit_enable(struct ftrace_event_call *call) +static int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -636,7 +633,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) return ret; } -void perf_sysexit_disable(struct ftrace_event_call *call) +static void perf_sysexit_disable(struct ftrace_event_call *call) { int num; -- cgit v1.2.3 From 59fa6245192159ab5e1e17b8e31f15afa9cff4bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Oct 2012 22:29:38 +0200 Subject: futex: Handle futex_pi OWNER_DIED take over correctly Siddhesh analyzed a failure in the take over of pi futexes in case the owner died and provided a workaround. See: http://sourceware.org/bugzilla/show_bug.cgi?id=14076 The detailed problem analysis shows: Futex F is initialized with PTHREAD_PRIO_INHERIT and PTHREAD_MUTEX_ROBUST_NP attributes. T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Succeeds due to the check for F's userspace TID field == 0 --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes T2 via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains real ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T3 --> observes inconsistent state This problem is independent of UP/SMP, preemptible/non preemptible kernels, or process shared vs. private. The only difference is that certain configurations are more likely to expose it. So as Siddhesh correctly analyzed the following check in futex_lock_pi_atomic() is the culprit: if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { We check the userspace value for a TID value of 0 and take over the futex unconditionally if that's true. AFAICT this check is there as it is correct for a different corner case of futexes: the WAITERS bit became stale. Now the proposed change - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + if (unlikely(ownerdied || + !(curval & (FUTEX_TID_MASK | FUTEX_WAITERS)))) { solves the problem, but it's not obvious why and it wreckages the "stale WAITERS bit" case. What happens is, that due to the WAITERS bit being set (T2 is blocked on that futex) it enforces T3 to go through lookup_pi_state(), which in the above case returns an existing pi_state and therefor forces T3 to legitimately fight with T2 over the ownership of the pi_state (via pi_state->mutex). 
Problem solved! Though that does not work for the "WAITERS bit is stale" problem because if lookup_pi_state() does not find an existing pi_state it returns -ESRCH (due to TID == 0) which causes futex_lock_pi() to return -ESRCH to user space because the OWNER_DIED bit is not set.
Now there is a different solution to that problem. Do not look at the user space value at all and enforce a lookup of possibly available pi_state. If pi_state can be found, then the new incoming locker T3 blocks on that pi_state and legitimately races with T2 to acquire the rt_mutex and the pi_state and therefore the proper ownership of the user space futex.
lookup_pi_state() has the correct order of checks. It first tries to find a pi_state associated with the user space futex and only if that fails it checks for futex TID value = 0. If no pi_state is available nothing can create new state at that point because this happens with the hash bucket lock held.
So the above scenario changes to: T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Finds pi_state and blocks on pi_state->rt_mutex T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes it via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space
This covers all gazillion points on which T3 might come in between T1's exit_robust_list() clearing the TID field and T2 fixing it up. It also solves the "WAITERS bit stale" problem by forcing the take over. Another benefit of changing the code this way is that it makes it less dependent on untrusted user space values and therefore minimizes the possible wreckage which might be inflicted.
As usual after staring for too long at the futex code my brain hurts so much that I really want to ditch that whole optimization of avoiding the syscall for the non-contended case for PI futexes and rip out the maze of corner case handling code. Unfortunately we can't as user space relies on that existing behaviour, but at least thinking about it helps me to preserve my mental sanity. Maybe we should nevertheless :)
Reported-and-tested-by: Siddhesh Poyarekar Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1210232138540.2756@ionos Acked-by: Darren Hart Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3717e7b306e0..20ef219bbe9b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, ownerdied = 0; + int lock_taken, ret, force_take = 0; u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: @@ -755,17 +755,15 @@ retry: newval = curval | FUTEX_WAITERS; /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! + * Should we force take the futex?
See below. */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ + if (unlikely(force_take)) { + /* + * Keep the OWNER_DIED and the WAITERS bit and set the + * new TID value. + */ newval = (curval & ~FUTEX_TID_MASK) | vpid; - ownerdied = 0; + force_take = 0; lock_taken = 1; } @@ -775,7 +773,7 @@ retry: goto retry; /* - * We took the lock due to owner died take over. + * We took the lock due to forced take over. */ if (unlikely(lock_taken)) return 1; @@ -790,20 +788,25 @@ retry: switch (ret) { case -ESRCH: /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. + * We failed to find an owner for this + * futex. So we have no pi_state to block + * on. This can happen in two cases: + * + * 1) The owner died + * 2) A stale FUTEX_WAITERS bit + * + * Re-read the futex value. */ if (get_futex_value_locked(&curval, uaddr)) return -EFAULT; /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. + * If the owner died or we have a stale + * WAITERS bit the owner TID in the user space + * futex is 0. */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; + if (!(curval & FUTEX_TID_MASK)) { + force_take = 1; goto retry; } default: -- cgit v1.2.3 From 293a7a0a165c4f8327bbcf396cee9ec672727c98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Oct 2012 15:07:49 -0700 Subject: genirq: Provide means to retrigger parent Attempts to retrigger nested threaded IRQs currently fail because they have no primary handler. In order to support retrigger of nested IRQs, the parent IRQ needs to be retriggered. To fix, when an IRQ needs to be resent, if the interrupt has a parent IRQ and runs in the context of the parent IRQ, then resend the parent. Also, handle_nested_irq() needs to clear the replay flag like the other handlers, otherwise check_irq_resend() will set it and it will never be cleared. Without clearing, it results in the first resend working fine, but check_irq_resend() returning early on subsequent resends because the replay flag is still set. Problem discovered on ARM/OMAP platforms where a nested IRQ that's also a wakeup IRQ happens late in suspend and needed to be retriggered during the resume process. 
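For a driver that demultiplexes such nested interrupts from the threaded handler of its parent line, the intended wiring looks roughly like the sketch below; my_chip, child_irq and parent_irq are illustrative placeholders, and only irq_set_parent() (added by this patch, built with CONFIG_HARDIRQS_SW_RESEND) is new:

	/* child_irq is serviced from parent_irq's irq thread */
	irq_set_chip_and_handler(child_irq, &my_chip, handle_simple_irq);
	irq_set_nested_thread(child_irq, true);
	/* new hook: lets check_irq_resend() retrigger the parent line */
	irq_set_parent(child_irq, parent_irq);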
[khilman@ti.com: changelog edits, clear IRQS_REPLAY in handle_nested_irq()] Reported-by: Kevin Hilman Tested-by: Kevin Hilman Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1350425269-11489-1-git-send-email-khilman@deeprootsystems.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 + kernel/irq/manage.c | 16 ++++++++++++++++ kernel/irq/resend.c | 8 ++++++++ 3 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 57d86d07221e..3aca9f29d30e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -272,6 +272,7 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c69326aa773..d06a396c7ce3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -616,6 +616,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +#ifdef CONFIG_HARDIRQS_SW_RESEND +int irq_set_parent(int irq, int parent_irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + + if (!desc) + return -EINVAL; + + desc->parent_irq = parent_irq; + + irq_put_desc_unlock(desc, flags); + return 0; +} +#endif + /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 6454db7b6a4d..9065107f083e 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND + /* + * If the interrupt has a parent irq and runs + * in the thread context of the parent irq, + * retrigger the parent. + */ + if (desc->parent_irq && + irq_settings_is_nested_thread(desc)) + irq = desc->parent_irq; /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); tasklet_schedule(&resend_tasklet); -- cgit v1.2.3 From f3de44edf376d18773febca6a37800c042bada7d Mon Sep 17 00:00:00 2001 From: Sankara Muthukrishnan Date: Wed, 31 Oct 2012 15:41:23 -0500 Subject: irq: Set CPU affinity right on thread creation As irq_thread_check_affinity is called ONLY inside the while loop in the irq thread, the core affinity is set only when an interrupt occurs. This patch sets the core affinity right after the irq thread is created and before it waits for interrupts. In real-tiime targets that do not typically change the core affinity of irqs during run-time, this patch will save additional latency of an irq thread in setting the core affinity during the first interrupt occurrence for that irq. 
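A rough usage sketch of the scenario described above; irq, target_cpu, my_irq_thread_fn and dev_id are placeholders, the calls are long-standing API, and only the moment at which the irq thread picks up its affinity changes with this patch:

	#include <linux/interrupt.h>

	static irqreturn_t my_irq_thread_fn(int irq, void *dev_id);	/* placeholder */

	static int my_setup_irq(unsigned int irq, int target_cpu, void *dev_id)
	{
		/* pin the line first ... */
		irq_set_affinity(irq, cpumask_of(target_cpu));
		/* ... then create the handler thread; it now starts out on
		 * target_cpu instead of migrating there when the first
		 * interrupt arrives */
		return request_threaded_irq(irq, NULL, my_irq_thread_fn,
					    IRQF_ONESHOT, "my-dev", dev_id);
	}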
Signed-off-by: Sankara S Muthukrishnan Acked-by: Steven Rostedt Link: http://lkml.kernel.org/r/CAFQPvXeVZ858WFYimEU5uvLNxLDd6bJMmqWihFmbCf3ntokz0A@mail.gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d06a396c7ce3..1cbd572f6ad8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -849,6 +849,8 @@ static int irq_thread(void *data) init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); + irq_thread_check_affinity(desc, action); + while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; -- cgit v1.2.3 From b8f61116c1ce342804a0897b0a80eb4df5f19453 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 25 Oct 2012 01:07:35 +0800 Subject: tick: Correct the comments for tick_sched_timer() In the comments of function tick_sched_timer(), the sentence "timer->base->cpu_base->lock held" is not right. In function __run_hrtimer(), before call timer->function(), the cpu_base->lock has been unlocked. Signed-off-by: liu chuansheng Cc: fei.li@intel.com Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1351098455.15558.1421.camel@cliu38-desktop-build Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 766d4c47a4a4..77729cc3750b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -809,7 +809,7 @@ void tick_check_idle(int cpu) #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. - * Called with interrupts disabled and timer->base->cpu_base->lock held. + * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { -- cgit v1.2.3 From 65f8f9a1c1db831e5159e3e3e50912d1f214cd0c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 31 Oct 2012 06:27:25 +0000 Subject: time: remove the timecompare code. This patch removes the timecompare code from the kernel. The top five reasons to do this are: 1. There are no more users of this code. 2. The original idea was a bit weak. 3. The original author has disappeared. 4. The code was not general purpose but tuned to a particular hardware, 5. There are better ways to accomplish clock synchronization. Signed-off-by: Richard Cochran Acked-by: John Stultz Tested-by: Bob Liu Signed-off-by: David S. Miller --- kernel/time/Makefile | 2 +- kernel/time/timecompare.c | 193 ---------------------------------------------- 2 files changed, 1 insertion(+), 194 deletions(-) delete mode 100644 kernel/time/timecompare.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c2..ff7d9d2ab504 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c deleted file mode 100644 index a9ae369925ce..000000000000 --- a/kernel/time/timecompare.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (C) 2009 Intel Corporation. 
- * Author: Patrick Ohly - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -/* - * fixed point arithmetic scale factor for skew - * - * Usually one would measure skew in ppb (parts per billion, 1e9), but - * using a factor of 2 simplifies the math. - */ -#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) - -ktime_t timecompare_transform(struct timecompare *sync, - u64 source_tstamp) -{ - u64 nsec; - - nsec = source_tstamp + sync->offset; - nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / - TIMECOMPARE_SKEW_RESOLUTION; - - return ns_to_ktime(nsec); -} -EXPORT_SYMBOL_GPL(timecompare_transform); - -int timecompare_offset(struct timecompare *sync, - s64 *offset, - u64 *source_tstamp) -{ - u64 start_source = 0, end_source = 0; - struct { - s64 offset; - s64 duration_target; - } buffer[10], sample, *samples; - int counter = 0, i; - int used; - int index; - int num_samples = sync->num_samples; - - if (num_samples > ARRAY_SIZE(buffer)) { - samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); - if (!samples) { - samples = buffer; - num_samples = ARRAY_SIZE(buffer); - } - } else { - samples = buffer; - } - - /* run until we have enough valid samples, but do not try forever */ - i = 0; - counter = 0; - while (1) { - u64 ts; - ktime_t start, end; - - start = sync->target(); - ts = timecounter_read(sync->source); - end = sync->target(); - - if (!i) - start_source = ts; - - /* ignore negative durations */ - sample.duration_target = ktime_to_ns(ktime_sub(end, start)); - if (sample.duration_target >= 0) { - /* - * assume symetric delay to and from source: - * average target time corresponds to measured - * source time - */ - sample.offset = - (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - - ts; - - /* simple insertion sort based on duration */ - index = counter - 1; - while (index >= 0) { - if (samples[index].duration_target < - sample.duration_target) - break; - samples[index + 1] = samples[index]; - index--; - } - samples[index + 1] = sample; - counter++; - } - - i++; - if (counter >= num_samples || i >= 100000) { - end_source = ts; - break; - } - } - - *source_tstamp = (end_source + start_source) / 2; - - /* remove outliers by only using 75% of the samples */ - used = counter * 3 / 4; - if (!used) - used = counter; - if (used) { - /* calculate average */ - s64 off = 0; - for (index = 0; index < used; index++) - off += samples[index].offset; - *offset = div_s64(off, used); - } - - if (samples && samples != buffer) - kfree(samples); - - return used; -} -EXPORT_SYMBOL_GPL(timecompare_offset); - -void __timecompare_update(struct timecompare *sync, - u64 source_tstamp) -{ - s64 offset; - u64 average_time; - - if (!timecompare_offset(sync, &offset, &average_time)) - return; - - if (!sync->last_update) { - sync->last_update = average_time; - sync->offset = offset; - sync->skew 
= 0; - } else { - s64 delta_nsec = average_time - sync->last_update; - - /* avoid division by negative or small deltas */ - if (delta_nsec >= 10000) { - s64 delta_offset_nsec = offset - sync->offset; - s64 skew; /* delta_offset_nsec * - TIMECOMPARE_SKEW_RESOLUTION / - delta_nsec */ - u64 divisor; - - /* div_s64() is limited to 32 bit divisor */ - skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; - divisor = delta_nsec; - while (unlikely(divisor >= ((s64)1) << 32)) { - /* divide both by 2; beware, right shift - of negative value has undefined - behavior and can only be used for - the positive divisor */ - skew = div_s64(skew, 2); - divisor >>= 1; - } - skew = div_s64(skew, divisor); - - /* - * Calculate new overall skew as 4/16 the - * old value and 12/16 the new one. This is - * a rather arbitrary tradeoff between - * only using the latest measurement (0/16 and - * 16/16) and even more weight on past measurements. - */ -#define TIMECOMPARE_NEW_SKEW_PER_16 12 - sync->skew = - div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * - sync->skew + - TIMECOMPARE_NEW_SKEW_PER_16 * skew, - 16); - sync->last_update = average_time; - sync->offset = offset; - } - } -} -EXPORT_SYMBOL_GPL(__timecompare_update); -- cgit v1.2.3 From 60303ed3f4b9332b9aa9bc17c68bc174e7343e2d Mon Sep 17 00:00:00 2001 From: David Sharp Date: Thu, 11 Oct 2012 16:27:52 -0700 Subject: tracing: Reset ring buffer when changing trace_clocks Because the "tsc" clock isn't in nanoseconds, the ring buffer must be reset when changing clocks so that incomparable timestamps don't end up in the same trace. Tested: Confirmed switching clocks resets the trace buffer. Google-Bug-Id: 6980623 Link: http://lkml.kernel.org/r/1349998076-15495-3-git-send-email-dhsharp@google.com Cc: Masami Hiramatsu Signed-off-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88111b08b2c1..6ed6013dff2b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4073,6 +4073,14 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (max_tr.buffer) ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + /* + * New clock may not be consistent with the previous clock. + * Reset the buffer so that it doesn't have incomparable timestamps. + */ + tracing_reset_online_cpus(&global_trace); + if (max_tr.buffer) + tracing_reset_online_cpus(&max_tr); + mutex_unlock(&trace_types_lock); *fpos += cnt; -- cgit v1.2.3 From 50ecf2c3afead23a05227ab004e4212eca08c207 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Thu, 11 Oct 2012 16:27:54 -0700 Subject: ring-buffer: Change unsigned long type of ring_buffer_oldest_event_ts() to u64 ring_buffer_oldest_event_ts() should return a value of u64 type, because ring_buffer_per_cpu->buffer_page->buffer_data_page->time_stamp is u64 type. 
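The truncation being fixed is easiest to see on a 32-bit kernel, where unsigned long is 32 bits wide; a small illustration, with buffer and cpu standing for whatever the caller already holds:

	u64 ts = ring_buffer_oldest_event_ts(buffer, cpu);
	/* with the old prototype a 32-bit caller kept only the low 32 bits,
	 * i.e. a nanosecond value that wraps about every 4.29 seconds */
	unsigned long truncated = (unsigned long)ts;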
Link: http://lkml.kernel.org/r/1349998076-15495-5-git-send-email-dhsharp@google.com Cc: Frederic Weisbecker Cc: Vaibhav Nagarnaik Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 23a384b92512..3c7834c24e54 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2932,12 +2932,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ -unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long ret; + u64 ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; -- cgit v1.2.3 From 15075cac423d634ddf39dac66f943b3bce847f87 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 May 2012 14:57:28 -0400 Subject: tracing: Separate open function from set_event and available_events The open function used by available_events is the same as set_event even though it uses different seq functions. This causes a side effect of writing into available_events clearing all events, even though available_events is suppose to be read only. There's no reason to keep a single function for just the open and have both use different functions for everything else. It is a little confusing and causes strange behavior. Just have each have their own function. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 46 ++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index dec47e70e254..cb2df3b70f7f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -491,19 +491,6 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int -ftrace_event_seq_open(struct inode *inode, struct file *file) -{ - const struct seq_operations *seq_ops; - - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_clear_events(); - - seq_ops = inode->i_private; - return seq_open(file, seq_ops); -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -980,6 +967,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static int ftrace_event_avail_open(struct inode *inode, struct file *file); +static int ftrace_event_set_open(struct inode *inode, struct file *file); + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -995,14 +985,14 @@ static const struct seq_operations show_set_event_seq_ops = { }; static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, + .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { - .open = ftrace_event_seq_open, + .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, @@ -1078,6 +1068,26 @@ static struct dentry *event_trace_events_dir(void) return d_events; } +static int +ftrace_event_avail_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops 
= &show_event_seq_ops; + + return seq_open(file, seq_ops); +} + +static int +ftrace_event_set_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_event_seq_ops; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_events(); + + return seq_open(file, seq_ops); +} + static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1508,15 +1518,13 @@ static __init int event_trace_init(void) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); + NULL, &ftrace_avail_fops); if (!entry) pr_warning("Could not create debugfs " "'available_events' entry\n"); entry = debugfs_create_file("set_event", 0644, d_tracer, - (void *)&show_set_event_seq_ops, - &ftrace_set_event_fops); + NULL, &ftrace_set_event_fops); if (!entry) pr_warning("Could not create debugfs " "'set_event' entry\n"); -- cgit v1.2.3 From c7b84ecada9a8b7fe3e6c081e70801703897ed5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 May 2012 20:54:53 -0400 Subject: tracing: Remove unused function unregister_tracer() The function register_tracer() is only used by kernel core code, that never needs to remove the tracer. As trace_events have become the main way to add new tracing to the kernel, the need to unregister a tracer has diminished. Remove the unused function unregister_tracer(). If a need arises where we need it, then we can always add it back. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 26 -------------------------- kernel/trace/trace.h | 1 - 2 files changed, 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6ed6013dff2b..d1d8039578ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -882,32 +882,6 @@ int register_tracer(struct tracer *type) return ret; } -void unregister_tracer(struct tracer *type) -{ - struct tracer **t; - - mutex_lock(&trace_types_lock); - for (t = &trace_types; *t; t = &(*t)->next) { - if (*t == type) - goto found; - } - pr_info("Tracer %s not registered\n", type->name); - goto out; - - found: - *t = (*t)->next; - - if (type == current_trace && tracer_enabled) { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(&global_trace); - current_trace = &nop_trace; - } -out: - mutex_unlock(&trace_types_lock); -} - void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 839ae003a053..3e8a176f64e6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -410,7 +410,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr); void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); -void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); enum trace_file_type { TRACE_FILE_LAT_FMT = 1, -- cgit v1.2.3 From 0fb9656d957d79dbe7ae155bb6533b1d465e4a50 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 May 2012 14:25:30 -0400 Subject: tracing: Make tracing_enabled be equal to tracing_on The tracing_enabled file has been deprecated as it never was able to serve its purpose well. The tracing_on file has taken over. Instead of having code to keep tracing_enabled, have the tracing_enabled file just set tracing_on, and remove the tracing_enabled variable. This allows us to remove the tracing_enabled file. 
The reason that the remove is in a different change set and not removed here is in case we find some lonely userspace tool that requires the file to exist. Then the removal patch will get reverted, but this one will not. Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 79 +++---------------------------------------- kernel/trace/trace_selftest.c | 12 ------- 2 files changed, 5 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1d8039578ab..3c9b96aee51a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -205,20 +205,9 @@ static struct trace_array max_tr; static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); -/* tracer_enabled is used to toggle activation of a tracer */ -static int tracer_enabled = 1; - -/** - * tracing_is_enabled - return tracer_enabled status - * - * This function is used by other tracers to know the status - * of the tracer_enabled flag. Tracers may use this function - * to know if it should enable their features when starting - * up. See irqsoff tracer for an example (start_irqsoff_tracer). - */ int tracing_is_enabled(void) { - return tracer_enabled; + return tracing_is_on(); } /* @@ -1112,8 +1101,7 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || - !tracing_is_on()) + if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) return; if (!__this_cpu_read(trace_cmdline_save)) @@ -2966,56 +2954,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = { .llseek = generic_file_llseek, }; -static ssize_t -tracing_ctrl_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - r = sprintf(buf, "%u\n", tracer_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -tracing_ctrl_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - val = !!val; - - mutex_lock(&trace_types_lock); - if (tracer_enabled ^ val) { - - /* Only need to warn if this is used to change the state */ - WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); - - if (val) { - tracer_enabled = 1; - if (current_trace->start) - current_trace->start(tr); - tracing_start(); - } else { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(tr); - } - } - mutex_unlock(&trace_types_lock); - - *ppos += cnt; - - return cnt; -} - static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3469,7 +3407,7 @@ static int tracing_wait_pipe(struct file *filp) return -EINTR; /* - * We block until we read something and tracing is disabled. + * We block until we read something and tracing is enabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, @@ -3477,7 +3415,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. 
*/ - if (!tracer_enabled && iter->pos) + if (tracing_is_enabled() && iter->pos) break; } @@ -4076,13 +4014,6 @@ static const struct file_operations tracing_max_lat_fops = { .llseek = generic_file_llseek, }; -static const struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, - .llseek = generic_file_llseek, -}; - static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, @@ -4858,7 +4789,7 @@ static __init int tracer_init_debugfs(void) d_tracer = tracing_init_dentry(); trace_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); + &global_trace, &rb_simple_fops); trace_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2c00a691a540..091b815f7b0a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; unsigned long count; char *func_name; int ret; @@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* enable tracing, and record the filter function */ ftrace_enabled = 1; - tracer_enabled = 1; /* passed in by parameter to fool gcc from optimizing */ func(); @@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; /* Enable tracing on all functions again */ ftrace_set_global_filter(NULL, 0, 1); @@ -452,7 +449,6 @@ static int trace_selftest_function_recursion(void) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; char *func_name; int len; int ret; @@ -465,7 +461,6 @@ trace_selftest_function_recursion(void) /* enable tracing, and record the filter function */ ftrace_enabled = 1; - tracer_enabled = 1; /* Handle PPC64 '.' name */ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); @@ -534,7 +529,6 @@ trace_selftest_function_recursion(void) ret = 0; out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; return ret; } @@ -569,7 +563,6 @@ static int trace_selftest_function_regs(void) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; char *func_name; int len; int ret; @@ -586,7 +579,6 @@ trace_selftest_function_regs(void) /* enable tracing, and record the filter function */ ftrace_enabled = 1; - tracer_enabled = 1; /* Handle PPC64 '.' 
name */ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); @@ -648,7 +640,6 @@ trace_selftest_function_regs(void) ret = 0; out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; return ret; } @@ -662,7 +653,6 @@ int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; unsigned long count; int ret; @@ -671,7 +661,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* start the tracing */ ftrace_enabled = 1; - tracer_enabled = 1; ret = tracer_init(trace, tr); if (ret) { @@ -708,7 +697,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ret = trace_selftest_function_regs(); out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; /* kill ftrace totally if we failed */ if (ret) -- cgit v1.2.3 From 02404baf1b47123f1c88c9f9f1f3b00e1e2b10db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Nov 2012 11:51:40 -0400 Subject: tracing: Remove deprecated tracing_enabled file The tracing_enabled file was used as a quick way to stop tracers, and try to bring down overhead for things like the latency tracers (irqsoff, wakeup, etc). But it didn't work that well. The tracing_on file was created as a really fast way to stop recording into the ftrace ring buffer and can interact with the kernel. That is a tracing_off() call in the kernel can disable recording of events, and then from userspace one could echo 1 into the tracing_on file to continue it. The tracing_enabled function did too much to allow for this. The tracing_on has taken over as a way to start and stop tracing and the tracing_enabled file should not be used. But because of its existance, it still confuses people. Over a year ago the following commit was added: commit 6752ab4a9c30d5411b2dfdb251a3f1cb18aae487 Author: Steven Rostedt Date: Tue Feb 8 13:54:06 2011 -0500 tracing: Deprecate tracing_enabled for tracing_on This commit added a WARN_ON() if the tracing_enabled file's variable was changed. After this was added, only LatencyTop complained, and they soon fixed their tool as there was no reason that LatencyTop should touch this file as it was using the perf ring buffers which this file does not interact with. But since that time no one else has complained about this WARN_ON(). Thus it is safe to assume that this file is no longer needed. Time to get rid of it. Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c9b96aee51a..d5cbc0d3f209 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4788,9 +4788,6 @@ static __init int tracer_init_debugfs(void) d_tracer = tracing_init_dentry(); - trace_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &rb_simple_fops); - trace_create_file("trace_options", 0644, d_tracer, NULL, &tracing_iter_fops); -- cgit v1.2.3 From 0d5c6e1c19bab82fad4837108c2902f557d62a04 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Nov 2012 20:54:21 -0400 Subject: tracing: Use irq_work for wake ups and remove *_nowake_*() functions Have the ring buffer commit function use the irq_work infrastructure to wake up any waiters waiting on the ring buffer for new data. 
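In outline the mechanism looks like the sketch below; the identifiers are illustrative stand-ins, and the real symbols added to trace.c appear in the diff that follows:

	#include <linux/irq_work.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(reader_wait);	/* readers sleep here */
	static struct irq_work reader_wakeup;

	static void reader_wakeup_fn(struct irq_work *work)
	{
		/* deferred: runs later from a safe hard interrupt context */
		wake_up_all(&reader_wait);
	}

	static void setup_wakeup(void)
	{
		init_irq_work(&reader_wakeup, reader_wakeup_fn);
	}

	static void commit_path(void)
	{
		/* instead of calling wake_up_all() here, where buffer or
		 * scheduler locks may already be held, defer the wake up */
		irq_work_queue(&reader_wakeup);
	}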
The irq_work was created for such a purpose, where doing the actual wake up at the time of adding data is too dangerous, as an event or function trace may be in the midst of the work queue locks and cause deadlocks. The irq_work will either delay the action to the next timer interrupt, or trigger an IPI to itself forcing an interrupt to do the work (in a safe location). With irq_work, all ring buffer commits can safely do wakeups, removing the need for the ring buffer commit "nowake" variants, which were used by events and function tracing. All commits can now safely use the normal commit, and the "nowake" variants can be removed. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + kernel/trace/trace.c | 121 +++++++++++++++++++++----------------- kernel/trace/trace.h | 5 -- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_kprobe.c | 8 +-- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_selftest.c | 1 + 7 files changed, 76 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4cea4f41c1d9..5d89335a485f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -119,6 +119,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select IRQ_WORK config GENERIC_TRACER bool diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d5cbc0d3f209..37d1c703e3ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,14 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static DEFINE_PER_CPU(bool, trace_cmdline_save); +/* + * When a reader is waiting for data, then this variable is + * set to true. + */ +static bool trace_wakeup_needed; + +static struct irq_work trace_work_wakeup; + /* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization @@ -329,12 +338,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); -static void wakeup_work_handler(struct work_struct *work) +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Schedules a delayed work to wake up any task that is blocked on the + * trace_wait queue. These is used with trace_poll for tasks polling the + * trace. + */ +static void trace_wake_up(struct irq_work *work) { - wake_up(&trace_wait); -} + wake_up_all(&trace_wait); -static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); +} /** * tracing_on - enable tracing buffers @@ -389,22 +404,6 @@ int tracing_is_on(void) } EXPORT_SYMBOL_GPL(tracing_is_on); -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. 
- */ -void trace_wake_up(void) -{ - const unsigned long delay = msecs_to_jiffies(2); - - if (trace_flags & TRACE_ITER_BLOCK) - return; - schedule_delayed_work(&wakeup_work, delay); -} - static int __init set_buf_size(char *str) { unsigned long buf_size; @@ -753,6 +752,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ +static void default_wait_pipe(struct trace_iterator *iter) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + trace_wakeup_needed = true; + + if (trace_empty(iter)) + schedule(); + + finish_wait(&trace_wait, &wait); +} + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer @@ -1156,30 +1189,32 @@ void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_cmdline_save, true); + if (trace_wakeup_needed) { + trace_wakeup_needed = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&trace_work_wakeup); + } ring_buffer_unlock_commit(buffer, event); } static inline void __trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) + unsigned long flags, int pc) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); ftrace_trace_userstack(buffer, flags, pc); - - if (wake) - trace_wake_up(); } void trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc); } +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, @@ -1196,29 +1231,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc); } EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); -} -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); - -void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs) +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int 
pc, + struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) @@ -3354,19 +3381,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } - -void default_wait_pipe(struct trace_iterator *iter) -{ - DEFINE_WAIT(wait); - - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - - if (trace_empty(iter)) - schedule(); - - finish_wait(&trace_wait, &wait); -} - /* * This is a make-shift waitqueue. * A tracer might use this callback on some rare cases: @@ -5107,6 +5121,7 @@ __init static int tracer_alloc_buffers(void) #endif trace_init_cmdlines(); + init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); current_trace = &nop_trace; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3e8a176f64e6..55010ed175f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -327,7 +327,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); void tracing_reset_current(int cpu); @@ -349,9 +348,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long len, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -370,7 +366,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); void ftrace(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index cb2df3b70f7f..880073d0b946 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1760,7 +1760,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(buffer, event, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5a3c533ef060..1865d5f76538 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + trace_buffer_unlock_commit_regs(buffer, event, + irq_flags, pc, regs); } /* Kretprobe handler */ @@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + trace_buffer_unlock_commit_regs(buffer, event, + 
irq_flags, pc, regs); } /* Event entry printers */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index b0a136ac382a..3374c792ccd8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -102,7 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!filter_check_discard(call, entry, buffer, event)) - trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(buffer, event, flags, pc); } static void diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 091b815f7b0a..47623169a815 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1094,6 +1094,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); + printk("ret = %d\n", ret); if (!ret) ret = trace_test_buffer(&max_tr, &count); -- cgit v1.2.3 From 7bcfaf54f591a0775254c4ea679faf615152ee3a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Nov 2012 22:56:07 -0400 Subject: tracing: Add trace_options kernel command line parameter Add trace_options to the kernel command line parameter to be able to set options at early boot. For example, to enable stack dumps of events, add the following: trace_options=stacktrace This along with the trace_event option, you can get not only traces of the events but also the stack dumps with them. Requested-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 54 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 37d1c703e3ec..c1434b5ce4d1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -155,6 +155,18 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); + +static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_options __initdata; + +static int __init set_trace_boot_options(char *str) +{ + strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_boot_options = trace_boot_options_buf; + return 0; +} +__setup("trace_options=", set_trace_boot_options); + unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; @@ -2838,24 +2850,14 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_printk_start_stop_comm(enabled); } -static ssize_t -tracing_trace_options_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int trace_set_options(char *option) { - char buf[64]; char *cmp; int neg = 0; - int ret; + int ret = 0; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + cmp = strstrip(option); if (strncmp(cmp, "no", 2) == 0) { neg = 1; @@ -2874,10 +2876,25 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); ret = set_tracer_option(current_trace, cmp, neg); mutex_unlock(&trace_types_lock); - if (ret) - return ret; } + return ret; +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + trace_set_options(buf); + 
*ppos += cnt; return cnt; @@ -5133,6 +5150,13 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + while (trace_boot_options) { + char *option; + + option = strsep(&trace_boot_options, ","); + trace_set_options(option); + } + return 0; out_free_cpumask: -- cgit v1.2.3 From 65b2c8f0e53347583168423de0f32227d8baf01b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 28 Oct 2012 16:55:36 +0100 Subject: uprobes/powerpc: Do not use arch_uprobe_*_step() helpers No functional changes. powerpc is the only user of arch_uprobe_enable/disable_step() helpers, but they should die. They can not be used correctly, every arch needs its own implementation (like x86 does). And they do not really help even as initial-and-almost-working code, arch_uprobe_*_xol() hooks can easily use user_enable/disable_single_step() directly. Change arch_uprobe_*_step() to do nothing, and convert powerpc to use ptrace helpers. This is equally wrong, powerpc needs the arch-specific fixes. Signed-off-by: Oleg Nesterov Acked-by: Ananth N Mavinakayanahalli Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5cc4e7e42e68..abbfd8440a6d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1432,12 +1432,10 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) { - user_enable_single_step(current); } void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) { - user_disable_single_step(current); } /* -- cgit v1.2.3 From 19f5ee2716373519fda2129e9333f4c3847aa742 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 28 Oct 2012 18:14:14 +0100 Subject: uprobes: Kill arch_uprobe_enable/disable_step() hooks Kill arch_uprobe_enable/disable_step() hooks, they do nothing and nobody needs them. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index abbfd8440a6d..39c75cc51efc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1430,14 +1430,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } -void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) -{ -} - -void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) -{ -} - /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1491,7 +1483,6 @@ static void handle_swbp(struct pt_regs *regs) goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) { - arch_uprobe_enable_step(&uprobe->arch); utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return; @@ -1523,7 +1514,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) else WARN_ON_ONCE(1); - arch_uprobe_disable_step(&uprobe->arch); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; -- cgit v1.2.3 From ed95779340b50e362245c81b5dec0d11a1debfa8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill cgroup_subsys->__DEPRECATED_clear_css_refs 2ef37d3fe4 ("memcg: Simplify mem_cgroup_force_empty_list error handling") removed the last user of __DEPRECATED_clear_css_refs. This patch removes __DEPRECATED_clear_css_refs and mechanisms to support it. 
* Conditionals dependent on __DEPRECATED_clear_css_refs removed. * cgroup_clear_css_refs() can no longer fail. All that needs to be done are deactivating refcnts, setting CSS_REMOVED and putting the base reference on each css. Remove cgroup_clear_css_refs() and the failure path, and open-code the loops into cgroup_rmdir(). This patch keeps the two for_each_subsys() loops separate while open coding them. They can be merged now but there are scheduled changes which need them to be separate, so keep them separate to reduce the amount of churn. local_irq_save/restore() from cgroup_clear_css_refs() are replaced with local_irq_disable/enable() for simplicity. This is safe as cgroup_rmdir() is always called with IRQ enabled. Note that this IRQ switching is necessary to ensure that css_tryget() isn't called from IRQ context on the same CPU while lower context is between CSS deactivation and setting CSS_REMOVED as css_tryget() would hang forever in such cases waiting for CSS to be re-activated or CSS_REMOVED set. This will go away soon. v2: cgroup_call_pre_destroy() removal dropped per Michal. Commit message updated to explain local_irq_disable/enable() conversion. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- kernel/cgroup.c | 131 ++++++++++++++++---------------------------------------- 1 file changed, 36 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..8c605e2bdd1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -865,11 +865,8 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) continue; ret = ss->pre_destroy(cgrp); - if (ret) { - /* ->pre_destroy() failure is being deprecated */ - WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); + if (WARN_ON_ONCE(ret)) break; - } } return ret; @@ -3901,14 +3898,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; /* - * If !clear_css_refs, css holds an extra ref to @cgrp->dentry - * which is put on the last css_put(). dput() requires process - * context, which css_put() may be called without. @css->dput_work - * will be used to invoke dput() asynchronously from css_put(). + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->dput_work will be used to invoke + * dput() asynchronously from css_put(). */ INIT_WORK(&css->dput_work, css_dput_fn); - if (ss->__DEPRECATED_clear_css_refs) - set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } /* @@ -3978,10 +3973,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; - /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ for_each_subsys(root, ss) - if (!ss->__DEPRECATED_clear_css_refs) - dget(dentry); + dget(dentry); /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -4066,71 +4060,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - * - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or - * not, cgroup removal behaves differently. 
- * - * If clear is set, css refcnt for the subsystem should be zero before - * cgroup removal can be committed. This is implemented by - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be - * called multiple times until all css refcnts reach zero and is allowed to - * veto removal on any invocation. This behavior is deprecated and will be - * removed as soon as the existing user (memcg) is updated. - * - * If clear is not set, each css holds an extra reference to the cgroup's - * dentry and cgroup removal proceeds regardless of css refs. - * ->pre_destroy() will be called at least once and is not allowed to fail. - * On the last put of each css, whenever that may be, the extra dentry ref - * is put so that dentry destruction happens only after all css's are - * released. - */ -static int cgroup_clear_css_refs(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; - - local_irq_save(flags); - - /* - * Block new css_tryget() by deactivating refcnt. If all refcnts - * for subsystems w/ clear_css_refs set were 1 at the moment of - * deactivation, we succeeded. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); - - if (ss->__DEPRECATED_clear_css_refs) - failed |= css_refcnt(css) != 1; - } - - /* - * If succeeded, set REMOVED and put all the base refs; otherwise, - * restore refcnts to positive values. Either way, all in-progress - * css_tryget() will be released. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - if (!failed) { - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } else { - atomic_sub(CSS_DEACT_BIAS, &css->refcnt); - } - } - - local_irq_restore(flags); - return !failed; -} - static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -4138,10 +4067,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *parent; DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; + struct cgroup_subsys *ss; int ret; /* the vfs holds both inode->i_mutex already */ -again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -4182,21 +4111,34 @@ again: return -EBUSY; } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; + + local_irq_disable(); + + /* block new css_tryget() by deactivating refcnt */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + WARN_ON(atomic_read(&css->refcnt) < 0); + atomic_add(CSS_DEACT_BIAS, &css->refcnt); + } + + /* + * Set REMOVED. All in-progress css_tryget() will be released. + * Put all the base refs. Each css holds an extra reference to the + * cgroup's dentry and cgroup removal proceeds regardless of css + * refs. On the last put of each css, whenever that may be, the + * extra dentry ref is put so that dentry destruction happens only + * after all css's are released. 
+ */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + set_bit(CSS_REMOVED, &css->flags); + css_put(css); } - /* NO css_tryget() can success after here. */ + + local_irq_enable(); + finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); @@ -4949,8 +4891,7 @@ void __css_put(struct cgroup_subsys_state *css) cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: - if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) - schedule_work(&css->dput_work); + schedule_work(&css->dput_work); break; } rcu_read_unlock(); -- cgit v1.2.3 From e93160803ffda2e67d9ff9cacb63bb6868c8398f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill CSS_REMOVED CSS_REMOVED is one of the several contortions which were necessary to support css reference draining on cgroup removal. All css->refcnts which need draining should be deactivated and verified to equal zero atomically w.r.t. css_tryget(). If any one isn't zero, all refcnts needed to be re-activated and css_tryget() shouldn't fail in the process. This was achieved by letting css_tryget() busy-loop until either the refcnt is reactivated (failed removal attempt) or CSS_REMOVED is set (committing to removal). Now that css refcnt draining is no longer used, there's no need for atomic rollback mechanism. css_tryget() simply can look at the reference count and fail if it's deactivated - it's never getting re-activated. This patch removes CSS_REMOVED and updates __css_tryget() to fail if the refcnt is deactivated. As deactivation and removal are a single step now, they no longer need to be protected against css_tryget() happening from irq context. Remove local_irq_disable/enable() from cgroup_rmdir(). Note that this removes css_is_removed() whose only user is VM_BUG_ON() in memcontrol.c. We can replace it with a check on the refcnt but given that the only use case is a debug assert, I think it's better to simply unexport it. v2: Comment updated and explanation on local_irq_disable/enable() added per Michal Hocko. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c605e2bdd1a..c194f9e4fc7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -170,8 +170,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -4112,8 +4112,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - local_irq_disable(); - /* block new css_tryget() by deactivating refcnt */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -4123,21 +4121,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } /* - * Set REMOVED. All in-progress css_tryget() will be released. * Put all the base refs. 
Each css holds an extra reference to the * cgroup's dentry and cgroup removal proceeds regardless of css * refs. On the last put of each css, whenever that may be, the * extra dentry ref is put so that dentry destruction happens only * after all css's are released. */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } - - local_irq_enable(); + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); @@ -4861,15 +4852,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); -- cgit v1.2.3 From 976c06bcccc50573997609fa7ec842479bd96ffb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: use cgroup_lock_live_group(parent) in cgroup_create() This patch makes cgroup_create() fail if @parent is marked removed. This is to prepare for further updates to cgroup_rmdir() path. Note that this change isn't strictly necessary. cgroup can only be created via mkdir and the removed marking and dentry removal happen without releasing cgroup_mutex, so cgroup_create() can never race with cgroup_rmdir(). Even after the scheduled updates to cgroup_rmdir(), cgroup_mkdir() and cgroup_rmdir() are synchronized by i_mutex rendering the added liveliness check unnecessary. Do it anyway such that locking is contained inside cgroup proper and we don't get nasty surprises if we ever grow another caller of cgroup_create(). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c194f9e4fc7b..f22e3cd1978b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3927,6 +3927,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it + * anyway so that locking is contained inside cgroup proper and we + * don't get nasty surprises if we ever grow another caller. + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; + goto err_free; + } + /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. 
This * can be done outside cgroup_mutex, since the sb can't @@ -3934,8 +3946,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); - mutex_lock(&cgroup_mutex); - init_cgroup_housekeeping(cgrp); cgrp->parent = parent; @@ -4006,7 +4016,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* Release the reference count that we took on the superblock */ deactivate_super(sb); - +err_free: kfree(cgrp); return err; } -- cgit v1.2.3 From 1a90dd508b0b00e382fd61a46f55dc889ac21b39 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: deactivate CSS's and mark cgroup dead before invoking ->pre_destroy() Because ->pre_destroy() could fail and can't be called under cgroup_mutex, cgroup destruction did something very ugly. 1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise. 2. Release cgroup_mutex and call ->pre_destroy(). 3. Re-grab cgroup_mutex and verify it can still be destroyed; fail otherwise. 4. Continue destroying. In addition to being ugly, it has been always broken in various ways. For example, memcg ->pre_destroy() expects the cgroup to be inactive after it's done but tasks can be attached and detached between #2 and #3 and the conditions that memcg verified in ->pre_destroy() might no longer hold by the time control reaches #3. Now that ->pre_destroy() is no longer allowed to fail. We can switch to the following. 1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise. 2. Deactivate CSS's and mark the cgroup removed thus preventing any further operations which can invalidate the verification from #1. 3. Release cgroup_mutex and call ->pre_destroy(). 4. Re-grab cgroup_mutex and continue destroying. After this change, controllers can safely assume that ->pre_destroy() will only be called only once for a given cgroup and, once ->pre_destroy() is called, the cgroup will stay dormant till it's destroyed. This removes the only reason ->pre_destroy() can fail - new task being attached or child cgroup being created inbetween. Error out path is removed and ->pre_destroy() invocation is open coded in cgroup_rmdir(). v2: cgroup_call_pre_destroy() removal moved to this patch per Michal. Commit message updated per Glauber. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 65 +++++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f22e3cd1978b..66204a6f68f3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -851,27 +851,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) { - if (!ss->pre_destroy) - continue; - - ret = ss->pre_destroy(cgrp); - if (WARN_ON_ONCE(ret)) - break; - } - - return ret; -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? 
if so, kfree() associated cgroup */ @@ -4078,19 +4057,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - int ret; - - /* the vfs holds both inode->i_mutex already */ - mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - mutex_unlock(&cgroup_mutex); /* * In general, subsystem has no css->refcnt after pre_destroy(). But @@ -4103,16 +4069,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) */ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. - */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; - } - + /* the vfs holds both inode->i_mutex already */ mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { @@ -4122,13 +4079,30 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - /* block new css_tryget() by deactivating refcnt */ + /* + * Block new css_tryget() by deactivating refcnt and mark @cgrp + * removed. This makes future css_tryget() and child creation + * attempts fail thus maintaining the removal conditions verified + * above. + */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); } + set_bit(CGRP_REMOVED, &cgrp->flags); + + /* + * Tell subsystems to initate destruction. pre_destroy() should be + * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix + * potential deadlock in pre_destroy") for details. + */ + mutex_unlock(&cgroup_mutex); + for_each_subsys(cgrp->root, ss) + if (ss->pre_destroy) + WARN_ON_ONCE(ss->pre_destroy(cgrp)); + mutex_lock(&cgroup_mutex); /* * Put all the base refs. Each css holds an extra reference to the @@ -4144,7 +4118,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); -- cgit v1.2.3 From b25ed609d0eecf077db607e88ea70bae83b6adb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup destruction rollback somewhat working. cgroup_rmdir() used to drain CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and helpers were used to allow the task performing rmdir to wait for the next relevant event. Unfortunately, the wait is visible to controllers too and the mechanism got exposed to memcg by 887032670d ("cgroup avoid permanent sleep at rmdir"). Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is unnecessary. Remove it and all the mechanisms supporting it. Note that memcontrol.c changes are essentially revert of 887032670d ("cgroup avoid permanent sleep at rmdir"). 
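For reference, the mechanism being deleted was the classic waitqueue retry
pattern; a condensed sketch (pieced together from the code removed below,
not a verbatim quote) of how cgroup_rmdir() used to wait for css refs to
drain:

	static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);

	/* rmdir side: announce the wait, try once, sleep until woken, retry */
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
	if (!cgroup_clear_css_refs(cgrp)) {
		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
			schedule();
		finish_wait(&cgroup_rmdir_waitq, &wait);
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		goto again;
	}

	/* waker side (css_put(), task attach, ...) */
	if (test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
		wake_up_all(&cgroup_rmdir_waitq);

With the draining and retry loop gone, rmdir no longer sleeps, so the flag,
the waitqueue and both helpers become dead weight and can be removed.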
Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh --- kernel/cgroup.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66204a6f68f3..c5f6fb28dd0e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -965,33 +965,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } -/* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - /* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function @@ -1963,12 +1936,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2138,7 +2105,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -4058,26 +4024,13 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. 
- */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - /* the vfs holds both inode->i_mutex already */ mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); /* * Block new css_tryget() by deactivating refcnt and mark @cgrp @@ -4114,9 +4067,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); @@ -4864,7 +4814,6 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: schedule_work(&css->dput_work); -- cgit v1.2.3 From bcf6de1b9129531215d26dd9af8331e84973bc52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: make ->pre_destroy() return void All ->pre_destory() implementations return 0 now, which is the only allowed return value. Make it return void. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh Cc: Vivek Goyal --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c5f6fb28dd0e..83cd7d041c62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4054,7 +4054,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&cgroup_mutex); for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) - WARN_ON_ONCE(ss->pre_destroy(cgrp)); + ss->pre_destroy(cgrp); mutex_lock(&cgroup_mutex); /* -- cgit v1.2.3 From 316eb661f125397d46f16f94e3c81ad3dc4c1233 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 8 Nov 2012 21:36:38 +0800 Subject: cgroup: set 'start' with the right value in cgroup_path. 'start' is set to buf + buflen and do the '--' immediately. Just set it to 'buf + buflen - 1' directly. Signed-off-by: Tao Ma Signed-off-by: Tejun Heo Cc: Li Zefan --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3070164e2036..998ab5957c6a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1770,9 +1770,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } - start = buf + buflen; + start = buf + buflen - 1; - *--start = '\0'; + *start = '\0'; for (;;) { int len = dentry->d_name.len; -- cgit v1.2.3 From 7b2e6011f150c42235c4a541d20cf6891afe878a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Oct 2012 10:54:03 -0700 Subject: rcu: Rename ->onofflock to ->orphan_lock The ->onofflock field in the rcu_state structure at one time synchronized CPU-hotplug operations for RCU. However, its scope has decreased over time so that it now only protects the lists of orphaned RCU callbacks. This commit therefore renames it to ->orphan_lock to reflect its current use. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 12 ++++++------ kernel/rcutree.h | 7 +++---- kernel/rcutree_plugin.h | 3 ++- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac8aed8ee417..89148860e2ba 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -70,7 +70,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .fqs_state = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ + .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ @@ -1573,7 +1573,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Send the specified CPU's RCU callbacks to the orphanage. The * specified CPU must be offline, and the caller must hold the - * ->onofflock. + * ->orphan_lock. */ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, @@ -1623,7 +1623,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, /* * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->onofflock. + * orphanage. The caller must hold the ->orphan_lock. */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) { @@ -1702,7 +1702,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); @@ -1729,10 +1729,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* * We still hold the leaf rcu_node structure lock here, and * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock * held leads to deadlock. */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ rnp = rdp->mynode; if (need_report & RCU_OFL_TASKS_NORM_GP) rcu_report_unblock_qs_rnp(rnp, flags); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a240f032848e..a7c945d149cf 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -383,9 +383,8 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; - /* exclude on/offline and */ - /* starting new GP. */ + raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; + /* Protect following fields. */ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ /* need a grace period. */ struct rcu_head **orphan_nxttail; /* Tail of above. */ @@ -394,7 +393,7 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ - /* End of fields guarded by onofflock. */ + /* End of fields guarded by orphan_lock. */ struct mutex onoff_mutex; /* Coordinate hotplug & GPs. 
*/ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f92115488187..2b281cf0b6f2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -757,7 +757,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * grace period for the specified rcu_node structure. If there are no such * tasks, report it up the rcu_node hierarchy. * - * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + * Caller must hold sync_rcu_preempt_exp_mutex and must exclude + * CPU hotplug operations. */ static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) -- cgit v1.2.3 From 1924bcb0259711eea98491a7942d1ffbf677e114 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 12:30:37 -0700 Subject: rcu: Avoid counter wrap in synchronize_sched_expedited() There is a counter scheme similar to ticket locking that synchronize_sched_expedited() uses to service multiple concurrent callers with the same expedited grace period. Upon entry, a sync_sched_expedited_started variable is atomically incremented, and upon completion of a expedited grace period a separate sync_sched_expedited_done variable is atomically incremented. However, if a synchronize_sched_expedited() is delayed while in try_stop_cpus(), concurrent invocations will increment the sync_sched_expedited_started counter, which will eventually overflow. If the original synchronize_sched_expedited() resumes execution just as the counter overflows, a concurrent invocation could incorrectly conclude that an expedited grace period elapsed in zero time, which would be bad. One could rely on counter size to prevent this from happening in practice, but the goal is to formally validate this code, so it needs to be fixed anyway. This commit therefore checks the gap between the two counters before incrementing sync_sched_expedited_started, and if the gap is too large, does a normal grace period instead. Overflow is thus only possible if there are more than about 3.5 billion threads on 32-bit systems, which can be excluded until such time as task_struct fits into a single byte and 4G/4G patches are accepted into mainline. It is also easy to encode this limitation into mechanical theorem provers. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 62 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89148860e2ba..678905555ca4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2249,8 +2249,8 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); +static atomic_long_t sync_sched_expedited_started = ATOMIC_LONG_INIT(0); +static atomic_long_t sync_sched_expedited_done = ATOMIC_LONG_INIT(0); static int synchronize_sched_expedited_cpu_stop(void *data) { @@ -2308,10 +2308,30 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { - int firstsnap, s, snap, trycount = 0; + long firstsnap, s, snap; + int trycount = 0; - /* Note that atomic_inc_return() implies full memory barrier. */ - firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + /* + * If we are in danger of counter wrap, just do synchronize_sched(). 
+ * By allowing sync_sched_expedited_started to advance no more than + * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring + * that more than 3.5 billion CPUs would be required to force a + * counter wrap on a 32-bit system. Quite a few more CPUs would of + * course be required on a 64-bit system. + */ + if (ULONG_CMP_GE((ulong)atomic_read(&sync_sched_expedited_started), + (ulong)atomic_read(&sync_sched_expedited_done) + + ULONG_MAX / 8)) { + synchronize_sched(); + return; + } + + /* + * Take a ticket. Note that atomic_inc_return() implies a + * full memory barrier. + */ + snap = atomic_long_inc_return(&sync_sched_expedited_started); + firstsnap = snap; get_online_cpus(); WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); @@ -2324,6 +2344,13 @@ void synchronize_sched_expedited(void) NULL) == -EAGAIN) { put_online_cpus(); + /* Check to see if someone else did our work for us. */ + s = atomic_long_read(&sync_sched_expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + /* No joy, try again later. Or just synchronize_sched(). */ if (trycount++ < 10) { udelay(trycount * num_online_cpus()); @@ -2332,23 +2359,22 @@ void synchronize_sched_expedited(void) return; } - /* Check to see if someone else did our work for us. */ - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + /* Recheck to see if someone else did our work for us. */ + s = atomic_long_read(&sync_sched_expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; } /* * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We subtract - * 1 to get the same token that the last incrementer got. - * We retry after they started, so our grace period works - * for them, and they started after our first try, so their - * grace period works for us. + * callers to piggyback on our grace period. We retry + * after they started, so our grace period works for them, + * and they started after our first try, so their grace + * period works for us. */ get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); + snap = atomic_long_read(&sync_sched_expedited_started); smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -2356,15 +2382,15 @@ void synchronize_sched_expedited(void) * Everyone up to our most recent fetch is covered by our grace * period. Update the counter, but only if our work is still * relevant -- which it won't be if someone who started later - * than we did beat us to the punch. + * than we did already did their update. */ do { - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + s = atomic_long_read(&sync_sched_expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { smp_mb(); /* ensure test happens before caller kfree */ break; } - } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + } while (atomic_long_cmpxchg(&sync_sched_expedited_done, s, snap) != s); put_online_cpus(); } -- cgit v1.2.3 From 40694d6644d5cca28531707559466122eb212d8b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 11 Oct 2012 15:24:03 -0700 Subject: rcu: Move synchronize_sched_expedited() state to rcu_state Tracing (debugfs) of expedited RCU primitives is required, which in turn requires that the relevant data be located where the tracing code can find it, not in its current static global variables in kernel/rcutree.c. This commit therefore moves sync_sched_expedited_started and sync_sched_expedited_done to the rcu_state structure, as fields ->expedited_start and ->expedited_done, respectively. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 20 +++++++++----------- kernel/rcutree.h | 3 +++ 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 678905555ca4..3c72e5e5528c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2249,9 +2249,6 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); -static atomic_long_t sync_sched_expedited_started = ATOMIC_LONG_INIT(0); -static atomic_long_t sync_sched_expedited_done = ATOMIC_LONG_INIT(0); - static int synchronize_sched_expedited_cpu_stop(void *data) { /* @@ -2310,6 +2307,7 @@ void synchronize_sched_expedited(void) { long firstsnap, s, snap; int trycount = 0; + struct rcu_state *rsp = &rcu_sched_state; /* * If we are in danger of counter wrap, just do synchronize_sched(). @@ -2319,8 +2317,8 @@ void synchronize_sched_expedited(void) * counter wrap on a 32-bit system. Quite a few more CPUs would of * course be required on a 64-bit system. */ - if (ULONG_CMP_GE((ulong)atomic_read(&sync_sched_expedited_started), - (ulong)atomic_read(&sync_sched_expedited_done) + + if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), + (ulong)atomic_long_read(&rsp->expedited_done) + ULONG_MAX / 8)) { synchronize_sched(); return; @@ -2330,7 +2328,7 @@ void synchronize_sched_expedited(void) * Take a ticket. Note that atomic_inc_return() implies a * full memory barrier. */ - snap = atomic_long_inc_return(&sync_sched_expedited_started); + snap = atomic_long_inc_return(&rsp->expedited_start); firstsnap = snap; get_online_cpus(); WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); @@ -2345,7 +2343,7 @@ void synchronize_sched_expedited(void) put_online_cpus(); /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&sync_sched_expedited_done); + s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; @@ -2360,7 +2358,7 @@ void synchronize_sched_expedited(void) } /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&sync_sched_expedited_done); + s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { smp_mb(); /* ensure test happens before caller kfree */ return; @@ -2374,7 +2372,7 @@ void synchronize_sched_expedited(void) * period works for us. */ get_online_cpus(); - snap = atomic_long_read(&sync_sched_expedited_started); + snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } @@ -2385,12 +2383,12 @@ void synchronize_sched_expedited(void) * than we did already did their update. 
*/ do { - s = atomic_long_read(&sync_sched_expedited_done); + s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { smp_mb(); /* ensure test happens before caller kfree */ break; } - } while (atomic_long_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); put_online_cpus(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a7c945d149cf..88f3d9d5971d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -404,6 +404,9 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + atomic_long_t expedited_start; /* Starting ticket. */ + atomic_long_t expedited_done; /* Done ticket. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ -- cgit v1.2.3 From a30489c5228fba6f16b4c740a0292879ef13371e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 16:18:09 -0700 Subject: rcu: Instrument synchronize_rcu_expedited() for debugfs tracing This commit adds the counters to rcu_state and updates them in synchronize_rcu_expedited() to provide the data needed for debugfs tracing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 18 +++++++++++++++--- kernel/rcutree.h | 9 +++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3c72e5e5528c..b966d56ebb51 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2321,6 +2321,7 @@ void synchronize_sched_expedited(void) (ulong)atomic_long_read(&rsp->expedited_done) + ULONG_MAX / 8)) { synchronize_sched(); + atomic_long_inc(&rsp->expedited_wrap); return; } @@ -2341,11 +2342,14 @@ void synchronize_sched_expedited(void) synchronize_sched_expedited_cpu_stop, NULL) == -EAGAIN) { put_online_cpus(); + atomic_long_inc(&rsp->expedited_tryfail); /* Check to see if someone else did our work for us. */ s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone1); return; } @@ -2354,13 +2358,16 @@ void synchronize_sched_expedited(void) udelay(trycount * num_online_cpus()); } else { synchronize_sched(); + atomic_long_inc(&rsp->expedited_normal); return; } /* Recheck to see if someone else did our work for us. */ s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone2); return; } @@ -2375,6 +2382,7 @@ void synchronize_sched_expedited(void) snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } + atomic_long_inc(&rsp->expedited_stoppedcpus); /* * Everyone up to our most recent fetch is covered by our grace @@ -2383,12 +2391,16 @@ void synchronize_sched_expedited(void) * than we did already did their update. 
*/ do { + atomic_long_inc(&rsp->expedited_done_tries); s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - smp_mb(); /* ensure test happens before caller kfree */ + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_done_lost); break; } } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); + atomic_long_inc(&rsp->expedited_done_exit); put_online_cpus(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 88f3d9d5971d..d274af357210 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -406,6 +406,15 @@ struct rcu_state { atomic_long_t expedited_start; /* Starting ticket. */ atomic_long_t expedited_done; /* Done ticket. */ + atomic_long_t expedited_wrap; /* # near-wrap incidents. */ + atomic_long_t expedited_tryfail; /* # acquisition failures. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_normal; /* # fallbacks to normal. */ + atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ + atomic_long_t expedited_done_tries; /* # tries to update _done. */ + atomic_long_t expedited_done_lost; /* # times beaten to _done. */ + atomic_long_t expedited_done_exit; /* # times exited _done loop. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ -- cgit v1.2.3 From 573bcd40d221bd6d7cebf27dee120bd242f5feb5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:02 +0800 Subject: rcu: Create directory for each flavor of rcu This patch will create subdirectory according to each flavor of rcu, the new structure will be: /debugfs/rcu/ -> rsp_0 -> rsp_1 -> ... So we can go to '/debugfs/rcu/rsp_0' and get the cpu info of rsp_0 there. The flavors of RCU are currently rcu_bh, rcu_preempt, and rcu_sched. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 693513bc50e6..62223a27f985 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -446,12 +446,20 @@ static struct dentry *rcudir; static int __init rcutree_trace_init(void) { + struct rcu_state *rsp; struct dentry *retval; + struct dentry *rspdir; rcudir = debugfs_create_dir("rcu", NULL); if (!rcudir) goto free_out; + for_each_rcu_flavor(rsp) { + rspdir = debugfs_create_dir(rsp->name, rcudir); + if (!rspdir) + goto free_out; + } + retval = debugfs_create_file("rcubarrier", 0444, rcudir, NULL, &rcubarrier_fops); if (!retval) -- cgit v1.2.3 From 374b928ee8061fdbb0b527fb3924080ba2437767 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:03 +0800 Subject: rcu: Fundamental facility for 'CPU units sequence reading' This patch add the fundamental facility used by the following patches, so we can implement the 'CPU units sequence reading' later. This helps us avoid losing data when there are too many CPUs and too small of a buffer, since this new approach allows userspace to read out the data one CPU at a time. Thus, if the buffer is not large enough, userspace will get whatever CPUs fit, and can then issue another read for the remainder of the data. Signed-off-by: Michael Wang Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 62223a27f985..0dfe9b512f0f 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,36 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +static int r_open(struct inode *inode, struct file *file, + const struct seq_operations *op) +{ + int ret = seq_open(file, op); + if (!ret) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static void *r_start(struct seq_file *m, loff_t *pos) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + *pos = cpumask_next(*pos - 1, cpu_possible_mask); + if ((*pos) < nr_cpu_ids) + return per_cpu_ptr(rsp->rda, *pos); + return NULL; +} + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return r_start(m, pos); +} + +static void r_stop(struct seq_file *m, void *v) +{ +} + static int show_rcubarrier(struct seq_file *m, void *unused) { struct rcu_state *rsp; -- cgit v1.2.3 From 878eda72e24d11e463a25b1dc7097a8d953f17cb Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:04 +0800 Subject: rcu: Optimize the 'rcudata' for RCU trace This patch implements the new 'rcudata' interface under each rsp directory, by using the 'CPU units sequence reading', thus avoiding loss of tracing data. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0dfe9b512f0f..a11522f62c71 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -174,6 +174,32 @@ static const struct file_operations rcudata_fops = { .release = single_release, }; +static int new_show_rcudata(struct seq_file *m, void *v) +{ + print_one_rcu_data(m, (struct rcu_data *)v); + return 0; +} + +static const struct seq_operations new_rcudate_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = new_show_rcudata, +}; + +static int new_rcudata_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &new_rcudate_op); +} + +static const struct file_operations new_rcudata_fops = { + .owner = THIS_MODULE, + .open = new_rcudata_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -488,6 +514,11 @@ static int __init rcutree_trace_init(void) rspdir = debugfs_create_dir(rsp->name, rcudir); if (!rspdir) goto free_out; + + retval = debugfs_create_file("rcudata", 0444, + rspdir, rsp, &new_rcudata_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From d29200efa2ad7a1dc516a1048faf98dcc01b9fef Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:05 +0800 Subject: rcu: Optimize the 'rcudata.csv' for RCU trace This patch implements the new 'rcudata.csv' interface under each rsp directory, by using the 'CPU units sequence reading', thus avoiding loss of tracing data. Signed-off-by: Michael Wang Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index a11522f62c71..e387a642632b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -267,6 +267,43 @@ static const struct file_operations rcudata_csv_fops = { .release = single_release, }; +static int new_show_rcudata_csv(struct seq_file *m, void *v) +{ + struct rcu_data *rdp = (struct rcu_data *)v; + if (cpumask_first(cpu_possible_mask) == rdp->cpu) { + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); + seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); + seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); +#ifdef CONFIG_RCU_BOOST + seq_puts(m, "\"kt\",\"ktl\""); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); + } + + print_one_rcu_data_csv(m, rdp); + return 0; +} + +static const struct seq_operations new_rcudate_csv_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = new_show_rcudata_csv, +}; + +static int new_rcudata_csv_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &new_rcudate_csv_op); +} + +static const struct file_operations new_rcudata_csv_fops = { + .owner = THIS_MODULE, + .open = new_rcudata_csv_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + #ifdef CONFIG_RCU_BOOST static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) @@ -519,6 +556,11 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &new_rcudata_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcudata.csv", 0444, + rspdir, rsp, &new_rcudata_csv_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From 51d0f16d49f6a99189e80c50e18a12325664ef41 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:06 +0800 Subject: rcu: Optimize the 'rcu_pending' for RCU trace This patch implements the new 'rcu_pending' interface under each rsp directory, by using the 'CPU units sequence reading', thus avoiding loss of tracing data. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index e387a642632b..8b2486722211 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -467,6 +467,8 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { + if (!rdp->beenonline) + return; seq_printf(m, "%3d%cnp=%ld ", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' 
: ' ', @@ -485,16 +487,12 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) static int show_rcu_pending(struct seq_file *m, void *unused) { int cpu; - struct rcu_data *rdp; struct rcu_state *rsp; for_each_rcu_flavor(rsp) { seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->beenonline) - print_one_rcu_pending(m, rdp); - } + for_each_possible_cpu(cpu) + print_one_rcu_pending(m, per_cpu_ptr(rsp->rda, cpu)); } return 0; } @@ -512,6 +510,32 @@ static const struct file_operations rcu_pending_fops = { .release = single_release, }; +static int new_show_rcu_pending(struct seq_file *m, void *v) +{ + print_one_rcu_pending(m, (struct rcu_data *)v); + return 0; +} + +static const struct seq_operations new_rcu_pending_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = new_show_rcu_pending, +}; + +static int new_rcu_pending_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &new_rcu_pending_op); +} + +static const struct file_operations new_rcu_pending_fops = { + .owner = THIS_MODULE, + .open = new_rcu_pending_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static int show_rcutorture(struct seq_file *m, void *unused) { seq_printf(m, "rcutorture test sequence: %lu %s\n", @@ -561,6 +585,11 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &new_rcudata_csv_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcu_pending", 0444, + rspdir, rsp, &new_rcu_pending_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From c011c41f11c4bf9b0c8d489a458770c24aeb2ebd Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:07 +0800 Subject: rcu: Replace the old interface with the new one This patch removed the old RCU debugfs interface and replaced it with the new one. Signed-off-by: Michael Wang Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 148 ++++++++----------------------------------------- 1 file changed, 24 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 8b2486722211..0e2ab6427b64 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -148,53 +148,27 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -static int show_rcudata(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcudata, NULL); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcudata(struct seq_file *m, void *v) +static int show_rcudata(struct seq_file *m, void *v) { print_one_rcu_data(m, (struct rcu_data *)v); return 0; } -static const struct seq_operations new_rcudate_op = { +static const struct seq_operations rcudate_op = { .start = r_start, .next = r_next, .stop = r_stop, - .show = new_show_rcudata, + .show = show_rcudata, }; -static int new_rcudata_open(struct inode *inode, struct file *file) +static int rcudata_open(struct inode *inode, struct file *file) { - return r_open(inode, file, &new_rcudate_op); + return r_open(inode, file, &rcudate_op); } -static const struct file_operations new_rcudata_fops = { +static const struct file_operations rcudata_fops = { .owner = THIS_MODULE, - .open = new_rcudata_open, + .open = rcudata_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -234,40 +208,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -static int show_rcudata_csv(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); -#ifdef CONFIG_RCU_BOOST - seq_puts(m, "\"kt\",\"ktl\""); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); - for_each_rcu_flavor(rsp) { - seq_printf(m, "\"%s:\"\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - -static int rcudata_csv_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcudata_csv, NULL); -} - -static const struct file_operations rcudata_csv_fops = { - .owner = THIS_MODULE, - .open = rcudata_csv_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcudata_csv(struct seq_file *m, void *v) +static int show_rcudata_csv(struct seq_file *m, void *v) { struct rcu_data *rdp = (struct rcu_data *)v; if (cpumask_first(cpu_possible_mask) == rdp->cpu) { @@ -284,21 +225,21 @@ static int new_show_rcudata_csv(struct seq_file *m, void *v) return 0; } -static const struct seq_operations new_rcudate_csv_op = { +static const struct seq_operations rcudate_csv_op = { .start = r_start, .next = r_next, .stop = r_stop, - .show = new_show_rcudata_csv, + .show = show_rcudata_csv, }; -static int 
new_rcudata_csv_open(struct inode *inode, struct file *file) +static int rcudata_csv_open(struct inode *inode, struct file *file) { - return r_open(inode, file, &new_rcudate_csv_op); + return r_open(inode, file, &rcudate_csv_op); } -static const struct file_operations new_rcudata_csv_fops = { +static const struct file_operations rcudata_csv_fops = { .owner = THIS_MODULE, - .open = new_rcudata_csv_open, + .open = rcudata_csv_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -484,53 +425,27 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_need_nothing); } -static int show_rcu_pending(struct seq_file *m, void *unused) -{ - int cpu; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - seq_printf(m, "%s:\n", rsp->name); - for_each_possible_cpu(cpu) - print_one_rcu_pending(m, per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_pending, NULL); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcu_pending(struct seq_file *m, void *v) +static int show_rcu_pending(struct seq_file *m, void *v) { print_one_rcu_pending(m, (struct rcu_data *)v); return 0; } -static const struct seq_operations new_rcu_pending_op = { +static const struct seq_operations rcu_pending_op = { .start = r_start, .next = r_next, .stop = r_stop, - .show = new_show_rcu_pending, + .show = show_rcu_pending, }; -static int new_rcu_pending_open(struct inode *inode, struct file *file) +static int rcu_pending_open(struct inode *inode, struct file *file) { - return r_open(inode, file, &new_rcu_pending_op); + return r_open(inode, file, &rcu_pending_op); } -static const struct file_operations new_rcu_pending_fops = { +static const struct file_operations rcu_pending_fops = { .owner = THIS_MODULE, - .open = new_rcu_pending_open, + .open = rcu_pending_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -577,17 +492,17 @@ static int __init rcutree_trace_init(void) goto free_out; retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &new_rcudata_fops); + rspdir, rsp, &rcudata_fops); if (!retval) goto free_out; retval = debugfs_create_file("rcudata.csv", 0444, - rspdir, rsp, &new_rcudata_csv_fops); + rspdir, rsp, &rcudata_csv_fops); if (!retval) goto free_out; retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &new_rcu_pending_fops); + rspdir, rsp, &rcu_pending_fops); if (!retval) goto free_out; } @@ -597,16 +512,6 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcudata.csv", 0444, rcudir, - NULL, &rcudata_csv_fops); - if (!retval) - goto free_out; - if (rcu_boost_trace_create_file(rcudir)) goto free_out; @@ -619,11 +524,6 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, rcudir, - NULL, &rcu_pending_fops); - if (!retval) - goto free_out; - retval = debugfs_create_file("rcutorture", 0444, rcudir, NULL, &rcutorture_fops); if (!retval) -- cgit v1.2.3 From 5f4ee1fa16fa1bee673b75722ca43350a74fba36 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 Sep 2012 08:51:08 +0800 Subject: rcu: Remove the interface "rcudata.csv" This patch 
removes the interface "rcudata.csv" since it is apparently not used. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 76 -------------------------------------------------- 1 file changed, 76 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0e2ab6427b64..bcc4865fea8b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -174,77 +174,6 @@ static const struct file_operations rcudata_fops = { .release = seq_release, }; -static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%d,%s,%lu,%lu,%d,%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", - rdp->completed, rdp->gpnum, - rdp->passed_quiesce, rdp->qs_pending); - seq_printf(m, ",%d,%llx,%d,%lu", - atomic_read(&rdp->dynticks->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, ",%lu", rdp->offline_fqs); - seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, ",%d,\"%c\"", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu))); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, ",%ld", rdp->blimit); - seq_printf(m, ",%lu,%lu,%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata_csv(struct seq_file *m, void *v) -{ - struct rcu_data *rdp = (struct rcu_data *)v; - if (cpumask_first(cpu_possible_mask) == rdp->cpu) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); -#ifdef CONFIG_RCU_BOOST - seq_puts(m, "\"kt\",\"ktl\""); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); - } - - print_one_rcu_data_csv(m, rdp); - return 0; -} - -static const struct seq_operations rcudate_csv_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata_csv, -}; - -static int rcudata_csv_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_csv_op); -} - -static const struct file_operations rcudata_csv_fops = { - .owner = THIS_MODULE, - .open = rcudata_csv_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - #ifdef CONFIG_RCU_BOOST static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) @@ -496,11 +425,6 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; - retval = debugfs_create_file("rcudata.csv", 0444, - rspdir, rsp, &rcudata_csv_fops); - if (!retval) - goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, rspdir, rsp, &rcu_pending_fops); if (!retval) -- cgit v1.2.3 From 42c3533eee88e012e1aa3c4d6d2cc53354130e24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Sep 2012 10:49:58 -0700 Subject: rcu: Fix tracing formatting The rcu_state structure's ->completed field is unsigned long, so this commit adjusts show_one_rcugp()'s printf() format to suit. Also add the required ACCESS_ONCE() directives while we are in this function. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 4 ++-- kernel/rcutree_trace.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b966d56ebb51..8ed9c481db03 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -68,8 +68,8 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ - .gpnum = -300, \ - .completed = -300, \ + .gpnum = 0UL - 300UL, \ + .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index bcc4865fea8b..3312ed7e411e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +#define ulong2long(a) (*(long *)(&(a))) + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -116,10 +118,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->completed, rdp->gpnum, + ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->passed_quiesce, rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), @@ -246,8 +248,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", - rsp->name, rsp->completed, gpnum, rsp->fqs_state, + seq_printf(m, "%s: c=%ld g=%ld s=%d jfq=%ld j=%x ", + rsp->name, ulong2long(rsp->completed), ulong2long(gpnum), + rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", @@ -301,16 +304,16 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp = &rsp->node[0]; raw_spin_lock_irqsave(&rnp->lock, flags); - completed = rsp->completed; - gpnum = rsp->gpnum; - if (rsp->completed == rsp->gpnum) + completed = ACCESS_ONCE(rsp->completed); + gpnum = ACCESS_ONCE(rsp->gpnum); + if (completed == gpnum) gpage = 0; else gpage = jiffies - rsp->gp_start; gpmax = rsp->gp_max; raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", - rsp->name, completed, gpnum, gpage, gpmax); + seq_printf(m, "%s: completed=%ld gpnum=%ld age=%ld max=%ld\n", + rsp->name, ulong2long(completed), ulong2long(gpnum), gpage, gpmax); } static int show_rcugp(struct seq_file *m, void *unused) -- cgit v1.2.3 From c25e557f5d49a7cb94fad473f5ced75b6c7ce094 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:16 +0800 Subject: rcu: split 'rcubarrier' to each flavor This patch add new 'rcubarrier' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcubarrier' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3312ed7e411e..65b6265531ff 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -103,6 +103,28 @@ static const struct file_operations rcubarrier_fops = { .release = single_release, }; +static int new_show_rcubarrier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + seq_printf(m, "bcc: %d nbd: %lu\n", + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); + return 0; +} + +static int new_rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, new_show_rcubarrier, inode->i_private); +} + +static const struct file_operations new_rcubarrier_fops = { + .owner = THIS_MODULE, + .open = new_rcubarrier_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + #ifdef CONFIG_RCU_BOOST static char convert_kthread_status(unsigned int kthread_status) @@ -432,6 +454,11 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &rcu_pending_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcubarrier", 0444, + rspdir, rsp, &new_rcubarrier_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From 29c67764f121a0980eb30d0314821ea631e6cfaf Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:17 +0800 Subject: rcu: split 'rcuboost' to each flavor This patch add new 'rcuboost' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcuboost' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 65b6265531ff..cae417de4b94 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -241,7 +241,7 @@ static const struct file_operations rcu_node_boost_fops = { .owner = THIS_MODULE, .open = rcu_node_boost_open, .read = seq_read, - .llseek = seq_lseek, + .llseek = no_llseek, .release = single_release, }; @@ -459,6 +459,15 @@ static int __init rcutree_trace_init(void) rspdir, rsp, &new_rcubarrier_fops); if (!retval) goto free_out; + +#ifdef CONFIG_RCU_BOOST + if (rsp == &rcu_preempt_state) { + retval = debugfs_create_file("rcuboost", 0444, + rspdir, NULL, &rcu_node_boost_fops); + if (!retval) + goto free_out; + } +#endif } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From 66b38bc52bd1e0d00987e23bf7153c46201ff2ba Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:18 +0800 Subject: rcu: split 'rcugp' to each flavor This patch add new 'rcugp' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcugp' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index cae417de4b94..c90d2a917def 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -360,6 +360,26 @@ static const struct file_operations rcugp_fops = { .release = single_release, }; +static int new_show_rcugp(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + show_one_rcugp(m, rsp); + return 0; +} + +static int new_rcugp_open(struct inode *inode, struct file *file) +{ + return single_open(file, new_show_rcugp, inode->i_private); +} + +static const struct file_operations new_rcugp_fops = { + .owner = THIS_MODULE, + .open = new_rcugp_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -468,6 +488,12 @@ static int __init rcutree_trace_init(void) goto free_out; } #endif + + retval = debugfs_create_file("rcugp", 0444, + rspdir, rsp, &new_rcugp_fops); + if (!retval) + goto free_out; + } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From a608d84bdb832a86ad3fdb0767df31fcda9fe280 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 8 Oct 2012 16:59:19 +0800 Subject: rcu: split 'rcuhier' to each flavor This patch add new 'rcuhier' to each flavor's folder, now we could use: 'cat /debugfs/rcu/rsp/rcuhier' to get the selected rsp info. Signed-off-by: Michael Wang Signed-off-by: Paul E. McKenney --- kernel/rcutree_trace.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c90d2a917def..107997ecdeeb 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -316,6 +316,26 @@ static const struct file_operations rcuhier_fops = { .release = single_release, }; +static int new_show_rcuhier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + print_one_rcu_state(m, rsp); + return 0; +} + +static int new_rcuhier_open(struct inode *inode, struct file *file) +{ + return single_open(file, new_show_rcuhier, inode->i_private); +} + +static const struct file_operations new_rcuhier_fops = { + .owner = THIS_MODULE, + .open = new_rcuhier_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) { unsigned long flags; @@ -494,6 +514,10 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + retval = debugfs_create_file("rcuhier", 0444, + rspdir, rsp, &new_rcuhier_fops); + if (!retval) + goto free_out; } retval = debugfs_create_file("rcubarrier", 0444, rcudir, -- cgit v1.2.3 From 6ee0886ff6c81526bf6ad8521d843b934aafa5aa Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 11 Oct 2012 14:26:42 -0700 Subject: rcu: Remove old debugfs interfaces and also RCU flavor name This commit removes the old debugfs interfaces, so that the new directory-per-RCU-flavor versions remain. Because the RCU flavor is given by the directory name, there is no need to print it out, so remove the name from the printout. Signed-off-by: Michael Wang Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 190 ++++++++++++------------------------------------- 1 file changed, 44 insertions(+), 146 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 107997ecdeeb..967115c508bc 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -78,32 +78,7 @@ static void r_stop(struct seq_file *m, void *v) { } -static int show_rcubarrier(struct seq_file *m, void *unused) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - seq_printf(m, "%s: bcc: %d nbd: %lu\n", - rsp->name, - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, NULL); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcubarrier(struct seq_file *m, void *v) +static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; seq_printf(m, "bcc: %d nbd: %lu\n", @@ -112,14 +87,14 @@ static int new_show_rcubarrier(struct seq_file *m, void *v) return 0; } -static int new_rcubarrier_open(struct inode *inode, struct file *file) +static int rcubarrier_open(struct inode *inode, struct file *file) { - return single_open(file, new_show_rcubarrier, inode->i_private); + return single_open(file, show_rcubarrier, inode->i_private); } -static const struct file_operations new_rcubarrier_fops = { +static const struct file_operations rcubarrier_fops = { .owner = THIS_MODULE, - .open = new_rcubarrier_open, + .open = rcubarrier_open, .read = seq_read, .llseek = no_llseek, .release = seq_release, @@ -245,23 +220,7 @@ static const struct file_operations rcu_node_boost_fops = { .release = single_release, }; -/* - * Create the rcuboost debugfs entry. Standard error return. - */ -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, - &rcu_node_boost_fops); -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static int rcu_boost_trace_create_file(struct dentry *rcudir) -{ - return 0; /* There cannot be an error if we didn't create it! 
*/ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_RCU_BOOST */ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { @@ -270,8 +229,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "%s: c=%ld g=%ld s=%d jfq=%ld j=%x ", - rsp->name, ulong2long(rsp->completed), ulong2long(gpnum), + seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", + ulong2long(rsp->completed), ulong2long(gpnum), rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); @@ -294,44 +253,22 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); } -static int show_rcuhier(struct seq_file *m, void *unused) +static int show_rcuhier(struct seq_file *m, void *v) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - print_one_rcu_state(m, rsp); + struct rcu_state *rsp = (struct rcu_state *)m->private; + print_one_rcu_state(m, rsp); return 0; } static int rcuhier_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcuhier, NULL); + return single_open(file, show_rcuhier, inode->i_private); } static const struct file_operations rcuhier_fops = { .owner = THIS_MODULE, .open = rcuhier_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int new_rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, new_show_rcuhier, inode->i_private); -} - -static const struct file_operations new_rcuhier_fops = { - .owner = THIS_MODULE, - .open = new_rcuhier_open, - .read = seq_read, .llseek = no_llseek, .release = seq_release, }; @@ -354,48 +291,26 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) gpage = jiffies - rsp->gp_start; gpmax = rsp->gp_max; raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "%s: completed=%ld gpnum=%ld age=%ld max=%ld\n", - rsp->name, ulong2long(completed), ulong2long(gpnum), gpage, gpmax); + seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", + ulong2long(completed), ulong2long(gpnum), gpage, gpmax); } -static int show_rcugp(struct seq_file *m, void *unused) +static int show_rcugp(struct seq_file *m, void *v) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - show_one_rcugp(m, rsp); + struct rcu_state *rsp = (struct rcu_state *)m->private; + show_one_rcugp(m, rsp); return 0; } static int rcugp_open(struct inode *inode, struct file *file) { - return single_open(file, show_rcugp, NULL); + return single_open(file, show_rcugp, inode->i_private); } static const struct file_operations rcugp_fops = { .owner = THIS_MODULE, .open = rcugp_open, .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int new_show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int new_rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, new_show_rcugp, inode->i_private); -} - -static const struct file_operations new_rcugp_fops = { - .owner = THIS_MODULE, - .open = new_rcugp_open, - .read = seq_read, .llseek = no_llseek, .release = seq_release, }; @@ -485,57 +400,40 @@ static int __init rcutree_trace_init(void) if (!rspdir) goto free_out; - retval = debugfs_create_file("rcudata", 0444, - rspdir, 
rsp, &rcudata_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcudata", 0444, + rspdir, rsp, &rcudata_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcu_pending", 0444, + rspdir, rsp, &rcu_pending_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &new_rcubarrier_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcubarrier", 0444, + rspdir, rsp, &rcubarrier_fops); + if (!retval) + goto free_out; #ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &new_rcugp_fops); + if (rsp == &rcu_preempt_state) { + retval = debugfs_create_file("rcuboost", 0444, + rspdir, NULL, &rcu_node_boost_fops); if (!retval) goto free_out; + } +#endif - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &new_rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcubarrier", 0444, rcudir, - NULL, &rcubarrier_fops); - if (!retval) - goto free_out; - - if (rcu_boost_trace_create_file(rcudir)) - goto free_out; - - retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcugp", 0444, + rspdir, rsp, &rcugp_fops); + if (!retval) + goto free_out; - retval = debugfs_create_file("rcuhier", 0444, rcudir, - NULL, &rcuhier_fops); - if (!retval) - goto free_out; + retval = debugfs_create_file("rcuhier", 0444, + rspdir, rsp, &rcuhier_fops); + if (!retval) + goto free_out; + } retval = debugfs_create_file("rcutorture", 0444, rcudir, NULL, &rcutorture_fops); -- cgit v1.2.3 From 7bd8f2a74bcbd39f4277766f4d49441c45dd10a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Oct 2012 17:14:32 -0700 Subject: rcu: Add tracing for synchronize_sched_expedited() This commit adds a per-RCU-flavor "rcuexp" file that dumps out statistics for synchonize_sched_expedited(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_trace.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 967115c508bc..f9512687a6e5 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -173,6 +173,38 @@ static const struct file_operations rcudata_fops = { .release = seq_release, }; +static int show_rcuexp(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + + seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", + atomic_long_read(&rsp->expedited_start), + atomic_long_read(&rsp->expedited_done), + atomic_long_read(&rsp->expedited_wrap), + atomic_long_read(&rsp->expedited_tryfail), + atomic_long_read(&rsp->expedited_workdone1), + atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_normal), + atomic_long_read(&rsp->expedited_stoppedcpus), + atomic_long_read(&rsp->expedited_done_tries), + atomic_long_read(&rsp->expedited_done_lost), + atomic_long_read(&rsp->expedited_done_exit)); + return 0; +} + +static int rcuexp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuexp, inode->i_private); +} + +static const struct file_operations rcuexp_fops = { + .owner = THIS_MODULE, + .open = rcuexp_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + #ifdef CONFIG_RCU_BOOST static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) @@ -405,6 +437,11 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + retval = debugfs_create_file("rcuexp", 0444, + rspdir, rsp, &rcuexp_fops); + if (!retval) + goto free_out; + retval = debugfs_create_file("rcu_pending", 0444, rspdir, rsp, &rcu_pending_fops); if (!retval) -- cgit v1.2.3 From a8638030f668884720b8f4456448d0ce33952b05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup: add cgroup_subsys->post_create() Currently, there's no way for a controller to find out whether a new cgroup finished all ->create() allocatinos successfully and is considered "live" by cgroup. This becomes a problem later when we add generic descendants walking to cgroup which can be used by controllers as controllers don't have a synchronization point where it can synchronize against new cgroups appearing in such walks. This patch adds ->post_create(). It's called after all ->create() succeeded and the cgroup is linked into the generic cgroup hierarchy. This plays the counterpart of ->pre_destroy(). When used in combination with the to-be-added generic descendant iterators, ->post_create() can be used to implement reliable state inheritance. It will be explained with the descendant iterators. v2: Added a paragraph about its future use w/ descendant iterators per Michal. v3: Forgot to add ->post_create() invocation to cgroup_load_subsys(). Fixed. 
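For illustration, a controller wires the new callback up alongside its existing ->create()/->destroy() handlers. The sketch below is a hypothetical "demo" subsystem (demo_subsys_id and the demo_css bookkeeping are invented for the example and would not compile without a real subsys_id); the callback signatures follow the ones this series uses, with ->post_create() receiving the now-linked cgroup.

/* Hedged sketch of a controller using ->post_create(); not kernel code,
 * the "demo" names are hypothetical. */
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct demo_css {
	struct cgroup_subsys_state css;
	bool online;	/* set only once the cgroup is fully linked */
};

static struct demo_css *cgrp_demo(struct cgroup *cgrp)
{
	return container_of(cgroup_subsys_state(cgrp, demo_subsys_id),
			    struct demo_css, css);
}

static struct cgroup_subsys_state *demo_create(struct cgroup *cgrp)
{
	struct demo_css *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

	if (!dc)
		return ERR_PTR(-ENOMEM);
	return &dc->css;	/* allocated but not yet visible to walks */
}

static void demo_post_create(struct cgroup *cgrp)
{
	/* @cgrp is now linked into the hierarchy; walks can see it */
	cgrp_demo(cgrp)->online = true;
}

static void demo_destroy(struct cgroup *cgrp)
{
	kfree(cgrp_demo(cgrp));
}

struct cgroup_subsys demo_subsys = {
	.name		= "demo",
	.create		= demo_create,
	.post_create	= demo_post_create,
	.destroy	= demo_destroy,
	.subsys_id	= demo_subsys_id,
};
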
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 998ab5957c6a..26f2edbaf4f5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4059,10 +4059,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; - /* each css holds a ref to the cgroup's dentry */ - for_each_subsys(root, ss) + for_each_subsys(root, ss) { + /* each css holds a ref to the cgroup's dentry */ dget(dentry); + /* creation succeeded, notify subsystems */ + if (ss->post_create) + ss->post_create(cgrp); + } + /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -4280,6 +4285,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) ss->active = 1; + if (ss->post_create) + ss->post_create(&ss->root->top_cgroup); + /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ BUG_ON(ss->module); @@ -4389,6 +4397,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->active = 1; + if (ss->post_create) + ss->post_create(&ss->root->top_cgroup); + /* success! */ mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.2.3 From eb6fd5040ee799009173daa49c3e7aa0362167c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup: use rculist ops for cgroup->children Use RCU safe list operations for cgroup->children. This will be used to implement cgroup children / descendant walking which can be used by controllers. Note that cgroup_create() now puts a new cgroup at the end of the ->children list instead of head. This isn't strictly necessary but is done so that the iteration order is more conventional. 
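The reason plain list_add()/list_del() no longer suffice is that ->children is about to be traversed under nothing more than rcu_read_lock(). A generic sketch of the writer/reader split follows (the names are illustrative, not cgroup code): writers still serialize against each other with a lock, readers only need RCU, and freeing is deferred until existing readers are done.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct node {
	struct list_head sibling;
	struct rcu_head rcu;
	int id;
};

static LIST_HEAD(children);
static DEFINE_SPINLOCK(children_lock);		/* serializes writers only */

static void add_child(struct node *n)
{
	spin_lock(&children_lock);
	list_add_tail_rcu(&n->sibling, &children);	/* readers may already see it */
	spin_unlock(&children_lock);
}

static void remove_child(struct node *n)
{
	spin_lock(&children_lock);
	list_del_rcu(&n->sibling);
	spin_unlock(&children_lock);
	kfree_rcu(n, rcu);	/* free only after current RCU readers finish */
}

static int count_children(void)
{
	struct node *n;
	int count = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(n, &children, sibling)
		count++;
	rcu_read_unlock();
	return count;
}
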
Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 26f2edbaf4f5..2ed5968a04e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1650,7 +1650,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&root_cgrp->sibling)); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); @@ -1699,7 +1698,6 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -4052,7 +4050,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - list_add(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4083,7 +4081,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - list_del(&cgrp->sibling); + list_del_rcu(&cgrp->sibling); root->number_of_cgroups--; err_destroy: @@ -4209,7 +4207,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) raw_spin_unlock(&release_list_lock); /* delete this cgroup from parent->children */ - list_del_init(&cgrp->sibling); + list_del_rcu(&cgrp->sibling); list_del_init(&cgrp->allcg_node); -- cgit v1.2.3 From 574bd9f7c7c1d32f52dea5488018a6ff79031e22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup: implement generic child / descendant walk macros Currently, cgroup doesn't provide any generic helper for walking a given cgroup's children or descendants. This patch adds the following three macros. * cgroup_for_each_child() - walk immediate children of a cgroup. * cgroup_for_each_descendant_pre() - visit all descendants of a cgroup in pre-order tree traversal. * cgroup_for_each_descendant_post() - visit all descendants of a cgroup in post-order tree traversal. All three only require the user to hold RCU read lock during traversal. Verifying that each iterated cgroup is online is the responsibility of the user. When used with proper synchronization, cgroup_for_each_descendant_pre() can be used to propagate state updates to descendants in reliable way. See comments for details. v2: s/config/state/ in commit message and comments per Michal. More documentation on synchronization rules. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Acked-by: Li Zefan --- kernel/cgroup.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2ed5968a04e7..0f8fa6aa371b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2984,6 +2984,92 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_pre(). Find the next + * descendant to visit for pre-order traversal of @cgroup's descendants. 
+ */ +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, pretend we just visited @cgroup */ + if (!pos) { + if (list_empty(&cgroup->children)) + return NULL; + pos = cgroup; + } + + /* visit the first child if exists */ + next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + do { + next = list_entry_rcu(pos->sibling.next, struct cgroup, + sibling); + if (&next->sibling != &pos->parent->children) + return next; + + pos = pos->parent; + } while (pos != cgroup); + + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); + +static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +{ + struct cgroup *last; + + do { + last = pos; + pos = list_first_or_null_rcu(&pos->children, struct cgroup, + sibling); + } while (pos); + + return last; +} + +/** + * cgroup_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_post(). Find the next + * descendant to visit for post-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, visit the leftmost descendant */ + if (!pos) { + next = cgroup_leftmost_descendant(cgroup); + return next != cgroup ? next : NULL; + } + + /* if there's an unvisited sibling, visit its leftmost descendant */ + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return cgroup_leftmost_descendant(next); + + /* no sibling left, visit parent */ + next = pos->parent; + return next != cgroup ? next : NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { -- cgit v1.2.3 From bcd66c894ac994ef5d6fbbaa9a24506ffaf64fd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup_freezer: trivial cleanups * Clean-up indentation and line-breaks. Drop the invalid comment about freezer->lock. * Make all internal functions take @freezer instead of both @cgroup and @freezer. 
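An aside on the descendant-walk helpers introduced just above: the pre-order rule ("visit the first child if there is one, otherwise the closest ancestor's next sibling, stopping at the starting cgroup") is easy to model outside the kernel. The toy program below is purely illustrative user-space C with made-up names; it mirrors the logic of cgroup_next_descendant_pre(), not the kernel data structures, RCU usage or locking.

#include <stdio.h>
#include <stdlib.h>

struct node {
	const char *name;
	struct node *parent;
	struct node *first_child;
	struct node *next_sibling;
};

/* Mirrors cgroup_next_descendant_pre(): first child if any, otherwise
 * the closest ancestor's next sibling, never walking past @root. */
static struct node *next_descendant_pre(struct node *pos, struct node *root)
{
	if (!pos)
		pos = root;		/* pretend we just visited @root */
	if (pos->first_child)
		return pos->first_child;
	while (pos != root) {
		if (pos->next_sibling)
			return pos->next_sibling;
		pos = pos->parent;
	}
	return NULL;
}

static struct node *make(const char *name, struct node *parent)
{
	struct node *n = calloc(1, sizeof(*n));
	struct node **link;

	if (!n)
		exit(1);
	n->name = name;
	n->parent = parent;
	if (parent) {
		/* append, matching the tail insertion cgroup_create() now uses */
		for (link = &parent->first_child; *link; link = &(*link)->next_sibling)
			;
		*link = n;
	}
	return n;
}

int main(void)
{
	struct node *root = make("root", NULL);
	struct node *a = make("a", root);
	struct node *b = make("b", root);
	struct node *pos;

	make("a1", a);
	make("a2", a);
	make("b1", b);

	for (pos = next_descendant_pre(NULL, root); pos;
	     pos = next_descendant_pre(pos, root))
		printf("%s\n", pos->name);
	return 0;
}

Run against the small tree built in main(), this prints a, a1, a2, b, b1: every descendant exactly once, parents before children, which is what makes reliable top-down state propagation possible.
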
Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index bedefd9a22df..975b3d8732f4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -29,17 +29,15 @@ enum freezer_state { }; struct freezer { - struct cgroup_subsys_state css; - enum freezer_state state; - spinlock_t lock; /* protects _writes_ to state */ + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; }; -static inline struct freezer *cgroup_freezer( - struct cgroup *cgroup) +static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of( - cgroup_subsys_state(cgroup, freezer_subsys_id), - struct freezer, css); + return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); } static inline struct freezer *task_freezer(struct task_struct *task) @@ -180,8 +178,9 @@ out: * migrated into or out of @cgroup, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) +static void update_if_frozen(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -211,12 +210,11 @@ notyet: static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { - struct freezer *freezer; + struct freezer *freezer = cgroup_freezer(cgroup); enum freezer_state state; - freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); - update_if_frozen(cgroup, freezer); + update_if_frozen(freezer); state = freezer->state; spin_unlock_irq(&freezer->lock); @@ -225,8 +223,9 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, return 0; } -static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void freeze_cgroup(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -236,8 +235,9 @@ static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void unfreeze_cgroup(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -247,11 +247,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static void freezer_change_state(struct cgroup *cgroup, +static void freezer_change_state(struct freezer *freezer, enum freezer_state goal_state) { - struct freezer *freezer = cgroup_freezer(cgroup); - /* also synchronizes against task migration, see freezer_attach() */ spin_lock_irq(&freezer->lock); @@ -260,13 +258,13 @@ static void freezer_change_state(struct cgroup *cgroup, if (freezer->state != CGROUP_THAWED) atomic_dec(&system_freezing_cnt); freezer->state = CGROUP_THAWED; - unfreeze_cgroup(cgroup, freezer); + unfreeze_cgroup(freezer); break; case CGROUP_FROZEN: if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; - freeze_cgroup(cgroup, freezer); + freeze_cgroup(freezer); break; default: BUG(); @@ -275,8 +273,7 @@ static void freezer_change_state(struct cgroup *cgroup, spin_unlock_irq(&freezer->lock); } -static int 
freezer_write(struct cgroup *cgroup, - struct cftype *cft, +static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { enum freezer_state goal_state; @@ -288,7 +285,7 @@ static int freezer_write(struct cgroup *cgroup, else return -EINVAL; - freezer_change_state(cgroup, goal_state); + freezer_change_state(cgroup_freezer(cgroup), goal_state); return 0; } -- cgit v1.2.3 From 04a4ec32571e88c614b229824afec376d24cc035 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: prepare freezer_change_state() for full hierarchy support * Make freezer_change_state() take bool @freeze instead of enum freezer_state. * Separate out freezer_apply_state() out of freezer_change_state(). This makes freezer_change_state() a rather silly thin wrapper. It will be filled with hierarchy handling later on. This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 975b3d8732f4..2690830e7428 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -247,45 +247,57 @@ static void unfreeze_cgroup(struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static void freezer_change_state(struct freezer *freezer, - enum freezer_state goal_state) +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze) { /* also synchronizes against task migration, see freezer_attach() */ - spin_lock_irq(&freezer->lock); + lockdep_assert_held(&freezer->lock); - switch (goal_state) { - case CGROUP_THAWED: - if (freezer->state != CGROUP_THAWED) - atomic_dec(&system_freezing_cnt); - freezer->state = CGROUP_THAWED; - unfreeze_cgroup(freezer); - break; - case CGROUP_FROZEN: + if (freeze) { if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; freeze_cgroup(freezer); - break; - default: - BUG(); + } else { + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + freezer->state = CGROUP_THAWED; + unfreeze_cgroup(freezer); } +} +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @cgroup according to @freeze. 
+ */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + /* update @freezer */ + spin_lock_irq(&freezer->lock); + freezer_apply_state(freezer, freeze); spin_unlock_irq(&freezer->lock); } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { - enum freezer_state goal_state; + bool freeze; if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) - goal_state = CGROUP_THAWED; + freeze = false; else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) - goal_state = CGROUP_FROZEN; + freeze = true; else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), goal_state); + freezer_change_state(cgroup_freezer(cgroup), freeze); return 0; } -- cgit v1.2.3 From d6a2fe134219adf94e6bd0c8f6e2a3163ff68c41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: make freezer->state mask of flags freezer->state was an enum value - one of THAWED, FREEZING and FROZEN. As the scheduled full hierarchy support requires more than one freezing condition, switch it to mask of flags. If FREEZING is not set, it's thawed. FREEZING is set if freezing or frozen. If frozen, both FREEZING and FROZEN are set. Now that tasks can be attached to an already frozen cgroup, this also makes freezing condition checks more natural. This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 60 ++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2690830e7428..e76aa9fb3ef4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -22,15 +22,14 @@ #include #include -enum freezer_state { - CGROUP_THAWED = 0, - CGROUP_FREEZING, - CGROUP_FROZEN, +enum freezer_state_flags { + CGROUP_FREEZING = (1 << 1), /* this freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ }; struct freezer { struct cgroup_subsys_state css; - enum freezer_state state; + unsigned int state; spinlock_t lock; }; @@ -48,12 +47,10 @@ static inline struct freezer *task_freezer(struct task_struct *task) bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state; bool ret; rcu_read_lock(); - state = task_freezer(task)->state; - ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; @@ -63,10 +60,13 @@ bool cgroup_freezing(struct task_struct *task) * cgroups_write_string() limits the size of freezer state strings to * CGROUP_LOCAL_BUFFER_SIZE */ -static const char *freezer_state_strs[] = { - "THAWED", - "FREEZING", - "FROZEN", +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; }; /* @@ -91,7 +91,6 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); spin_lock_init(&freezer->lock); - freezer->state = CGROUP_THAWED; return &freezer->css; } @@ -99,7 +98,7 @@ static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); - if (freezer->state != CGROUP_THAWED) + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); kfree(freezer); } @@ -129,15 +128,13 @@ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) * 
Tasks in @tset are on @new_cgrp but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - * This means that, to determine whether to freeze, one should test - * whether the state equals THAWED. */ cgroup_taskset_for_each(task, new_cgrp, tset) { - if (freezer->state == CGROUP_THAWED) { + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state = CGROUP_FREEZING; + freezer->state &= ~CGROUP_FROZEN; } } @@ -159,11 +156,7 @@ static void freezer_fork(struct task_struct *task) goto out; spin_lock_irq(&freezer->lock); - /* - * @task might have been just migrated into a FROZEN cgroup. Test - * equality with THAWED. Read the comment in freezer_attach(). - */ - if (freezer->state != CGROUP_THAWED) + if (freezer->state & CGROUP_FREEZING) freeze_task(task); spin_unlock_irq(&freezer->lock); out: @@ -184,7 +177,8 @@ static void update_if_frozen(struct freezer *freezer) struct cgroup_iter it; struct task_struct *task; - if (freezer->state != CGROUP_FREEZING) + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) return; cgroup_iter_start(cgroup, &it); @@ -202,7 +196,7 @@ static void update_if_frozen(struct freezer *freezer) } } - freezer->state = CGROUP_FROZEN; + freezer->state |= CGROUP_FROZEN; notyet: cgroup_iter_end(cgroup, &it); } @@ -211,14 +205,14 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { struct freezer *freezer = cgroup_freezer(cgroup); - enum freezer_state state; + unsigned int state; spin_lock_irq(&freezer->lock); update_if_frozen(freezer); state = freezer->state; spin_unlock_irq(&freezer->lock); - seq_puts(m, freezer_state_strs[state]); + seq_puts(m, freezer_state_strs(state)); seq_putc(m, '\n'); return 0; } @@ -258,14 +252,14 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze) lockdep_assert_held(&freezer->lock); if (freeze) { - if (freezer->state == CGROUP_THAWED) + if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); - freezer->state = CGROUP_FREEZING; + freezer->state |= CGROUP_FREEZING; freeze_cgroup(freezer); } else { - if (freezer->state != CGROUP_THAWED) + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); - freezer->state = CGROUP_THAWED; + freezer->state &= ~(CGROUP_FREEZING | CGROUP_FROZEN); unfreeze_cgroup(freezer); } } @@ -290,9 +284,9 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, { bool freeze; - if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + if (strcmp(buffer, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; -- cgit v1.2.3 From a225218060fc8f10ed396c9c8187074697ad044d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: introduce CGROUP_FREEZING_[SELF|PARENT] Introduce FREEZING_SELF and FREEZING_PARENT and make FREEZING OR of the two flags. This is to prepare for full hierarchy support. freezer_apply_date() is updated such that it can handle setting and clearing of both flags. The two flags are also exposed to userland via read-only files self_freezing and parent_freezing. Other than the added cgroupfs files, this patch doesn't introduce any behavior change. 
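How the flags compose can be seen in a few lines of plain user-space C (the enum values copy the ones in the diff below; everything else is illustrative). The two right-hand columns correspond to what the new read-only self_freezing and parent_freezing files report.

#include <stdio.h>

enum {
	CGROUP_FREEZING_SELF	= 1 << 1,	/* frozen via this cgroup's freezer.state */
	CGROUP_FREEZING_PARENT	= 1 << 2,	/* freezing because an ancestor is freezing */
	CGROUP_FROZEN		= 1 << 3,	/* freeze of this cgroup has completed */
	CGROUP_FREEZING		= CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
};

/* Same decoding rule as freezer_state_strs() in the kernel code. */
static const char *state_str(unsigned int state)
{
	if (state & CGROUP_FROZEN)
		return "FROZEN";
	if (state & CGROUP_FREEZING)
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	unsigned int examples[] = {
		0,					/* THAWED */
		CGROUP_FREEZING_SELF,			/* user wrote FROZEN to this cgroup */
		CGROUP_FREEZING_PARENT,			/* an ancestor is freezing */
		CGROUP_FREEZING_SELF | CGROUP_FROZEN,	/* freeze completed */
	};
	unsigned int i;

	for (i = 0; i < sizeof(examples) / sizeof(examples[0]); i++)
		printf("0x%02x -> %-8s self=%d parent=%d\n",
		       examples[i], state_str(examples[i]),
		       !!(examples[i] & CGROUP_FREEZING_SELF),
		       !!(examples[i] & CGROUP_FREEZING_PARENT));
	return 0;
}
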
Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 55 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e76aa9fb3ef4..b8ad93c6f5f6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -23,8 +23,12 @@ #include enum freezer_state_flags { - CGROUP_FREEZING = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, }; struct freezer { @@ -245,8 +249,13 @@ static void unfreeze_cgroup(struct freezer *freezer) * freezer_apply_state - apply state change to a single cgroup_freezer * @freezer: freezer to apply state change to * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. */ -static void freezer_apply_state(struct freezer *freezer, bool freeze) +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer->lock); @@ -254,13 +263,19 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze) if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); - freezer->state |= CGROUP_FREEZING; + freezer->state |= state; freeze_cgroup(freezer); } else { - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~(CGROUP_FREEZING | CGROUP_FROZEN); - unfreeze_cgroup(freezer); + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } } } @@ -275,7 +290,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { /* update @freezer */ spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze); + freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); spin_unlock_irq(&freezer->lock); } @@ -295,6 +310,20 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, return 0; } +static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + static struct cftype files[] = { { .name = "state", @@ -302,6 +331,16 @@ static struct cftype files[] = { .read_seq_string = freezer_read, .write_string = freezer_write, }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, { } /* terminate */ }; -- cgit v1.2.3 From 5300a9b3482b6d9c32de6d5f4eaeab0fbafa70a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 
Subject: cgroup_freezer: add ->post_create() and ->pre_destroy() and track online state A cgroup is online and visible to iteration between ->post_create() and ->pre_destroy(). This patch introduces CGROUP_FREEZER_ONLINE and toggles it from the newly added freezer_post_create() and freezer_pre_destroy() while holding freezer->lock such that a cgroup_freezer can be reilably distinguished to be online. This will be used by full hierarchy support. ONLINE test is added to freezer_apply_state() but it currently doesn't make any difference as freezer_write() can only be called for an online cgroup. Adjusting system_freezing_cnt on destruction is moved from freezer_destroy() to the new freezer_pre_destroy() for consistency. This patch doesn't introduce any noticeable behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b8ad93c6f5f6..4f12d317c4c3 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -23,6 +23,7 @@ #include enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ @@ -98,13 +99,45 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) return &freezer->css; } -static void freezer_destroy(struct cgroup *cgroup) +/** + * freezer_post_create - commit creation of a freezer cgroup + * @cgroup: cgroup being created + * + * We're committing to creation of @cgroup. Mark it online. + */ +static void freezer_post_create(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + freezer->state |= CGROUP_FREEZER_ONLINE; + spin_unlock_irq(&freezer->lock); +} + +/** + * freezer_pre_destroy - initiate destruction of @cgroup + * @cgroup: cgroup being destroyed + * + * @cgroup is going away. Mark it dead and decrement system_freezing_count + * if it was holding one. 
+ */ +static void freezer_pre_destroy(struct cgroup *cgroup) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + spin_lock_irq(&freezer->lock); + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); - kfree(freezer); + + freezer->state = 0; + + spin_unlock_irq(&freezer->lock); +} + +static void freezer_destroy(struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); } /* @@ -260,6 +293,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer->lock); + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); @@ -347,6 +383,8 @@ static struct cftype files[] = { struct cgroup_subsys freezer_subsys = { .name = "freezer", .create = freezer_create, + .post_create = freezer_post_create, + .pre_destroy = freezer_pre_destroy, .destroy = freezer_destroy, .subsys_id = freezer_subsys_id, .attach = freezer_attach, -- cgit v1.2.3 From ef9fe980c6fcc1821ab955b74b242d2d6585fa75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: implement proper hierarchy support Up until now, cgroup_freezer didn't implement hierarchy properly. cgroups could be arranged in hierarchy but it didn't make any difference in how each cgroup_freezer behaved. They all operated separately. This patch implements proper hierarchy support. If a cgroup is frozen, all its descendants are frozen. A cgroup is thawed iff it and all its ancestors are THAWED. freezer.self_freezing shows the current freezing state for the cgroup itself. freezer.parent_freezing shows whether the cgroup is freezing because any of its ancestors is freezing. freezer_post_create() locks the parent and new cgroup and inherits the parent's state and freezer_change_state() applies new state top-down using cgroup_for_each_descendant_pre() which guarantees that no child can escape its parent's state. update_if_frozen() uses cgroup_for_each_descendant_post() to propagate frozen states bottom-up. Synchronization could be coarser and easier by using a single mutex to protect all hierarchy operations. Finer grained approach was used because it wasn't too difficult for cgroup_freezer and I think it's beneficial to have an example implementation and cgroup_freezer is rather simple and can serve a good one. As this makes cgroup_freezer properly hierarchical, freezer_subsys.broken_hierarchy marking is removed. Note that this patch changes userland visible behavior - freezing a cgroup now freezes all its descendants too. This behavior change is intended and has been warned via .broken_hierarchy. v2: Michal spotted a bug in freezer_change_state() - descendants were inheriting from the wrong ancestor. Fixed. v3: Documentation/cgroups/freezer-subsystem.txt updated. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 161 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 123 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 4f12d317c4c3..670a4af7dc94 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -22,6 +22,13 @@ #include #include +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". 
FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ enum freezer_state_flags { CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ @@ -50,6 +57,15 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } +static struct freezer *parent_freezer(struct freezer *freezer) +{ + struct cgroup *pcg = freezer->css.cgroup->parent; + + if (pcg) + return cgroup_freezer(pcg); + return NULL; +} + bool cgroup_freezing(struct task_struct *task) { bool ret; @@ -74,17 +90,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -/* - * State diagram - * Transitions are caused by userspace writes to the freezer.state file. - * The values in parenthesis are state labels. The rest are edge labels. - * - * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) - * ^ ^ | | - * | \_______THAWED_______/ | - * \__________________________THAWED____________/ - */ - struct cgroup_subsys freezer_subsys; static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) @@ -103,15 +108,34 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) * freezer_post_create - commit creation of a freezer cgroup * @cgroup: cgroup being created * - * We're committing to creation of @cgroup. Mark it online. + * We're committing to creation of @cgroup. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. */ static void freezer_post_create(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *parent = parent_freezer(freezer); + + /* + * The following double locking and freezing state inheritance + * guarantee that @cgroup can never escape ancestors' freezing + * states. See cgroup_for_each_descendant_pre() for details. + */ + if (parent) + spin_lock_irq(&parent->lock); + spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); - spin_lock_irq(&freezer->lock); freezer->state |= CGROUP_FREEZER_ONLINE; - spin_unlock_irq(&freezer->lock); + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + spin_unlock(&freezer->lock); + if (parent) + spin_unlock_irq(&parent->lock); } /** @@ -153,6 +177,7 @@ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { struct freezer *freezer = cgroup_freezer(new_cgrp); struct task_struct *task; + bool clear_frozen = false; spin_lock_irq(&freezer->lock); @@ -172,10 +197,25 @@ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) } else { freeze_task(task); freezer->state &= ~CGROUP_FROZEN; + clear_frozen = true; } } spin_unlock_irq(&freezer->lock); + + /* + * Propagate FROZEN clearing upwards. We may race with + * update_if_frozen(), but as long as both work bottom-up, either + * update_if_frozen() sees child's FROZEN cleared or we clear the + * parent's FROZEN later. No parent w/ !FROZEN children can be + * left FROZEN. 
+ */ + while (clear_frozen && (freezer = parent_freezer(freezer))) { + spin_lock_irq(&freezer->lock); + freezer->state &= ~CGROUP_FROZEN; + clear_frozen = freezer->state & CGROUP_FREEZING; + spin_unlock_irq(&freezer->lock); + } } static void freezer_fork(struct task_struct *task) @@ -200,24 +240,47 @@ out: rcu_read_unlock(); } -/* - * We change from FREEZING to FROZEN lazily if the cgroup was only - * partially frozen when we exitted write. Caller must hold freezer->lock. +/** + * update_if_frozen - update whether a cgroup finished freezing + * @cgroup: cgroup of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being * migrated into or out of @cgroup, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct freezer *freezer) +static void update_if_frozen(struct cgroup *cgroup) { - struct cgroup *cgroup = freezer->css.cgroup; + struct freezer *freezer = cgroup_freezer(cgroup); + struct cgroup *pos; struct cgroup_iter it; struct task_struct *task; + WARN_ON_ONCE(!rcu_read_lock_held()); + + spin_lock_irq(&freezer->lock); + if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) - return; + goto out_unlock; + + /* are all (live) children frozen? */ + cgroup_for_each_child(pos, cgroup) { + struct freezer *child = cgroup_freezer(pos); + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) + goto out_unlock; + } + + /* are all tasks frozen? */ cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { @@ -229,27 +292,32 @@ static void update_if_frozen(struct freezer *freezer) * the usual frozen condition. */ if (!frozen(task) && !freezer_should_skip(task)) - goto notyet; + goto out_iter_end; } } freezer->state |= CGROUP_FROZEN; -notyet: +out_iter_end: cgroup_iter_end(cgroup, &it); +out_unlock: + spin_unlock_irq(&freezer->lock); } static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { - struct freezer *freezer = cgroup_freezer(cgroup); - unsigned int state; + struct cgroup *pos; - spin_lock_irq(&freezer->lock); - update_if_frozen(freezer); - state = freezer->state; - spin_unlock_irq(&freezer->lock); + rcu_read_lock(); - seq_puts(m, freezer_state_strs(state)); + /* update states bottom-up */ + cgroup_for_each_descendant_post(pos, cgroup) + update_if_frozen(pos); + update_if_frozen(cgroup); + + rcu_read_unlock(); + + seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); seq_putc(m, '\n'); return 0; } @@ -320,14 +388,39 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, * @freezer: freezer of interest * @freeze: whether to freeze or thaw * - * Freeze or thaw @cgroup according to @freeze. + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. 
*/ static void freezer_change_state(struct freezer *freezer, bool freeze) { + struct cgroup *pos; + /* update @freezer */ spin_lock_irq(&freezer->lock); freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); spin_unlock_irq(&freezer->lock); + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + rcu_read_lock(); + cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { + struct freezer *pos_f = cgroup_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + /* + * Our update to @parent->state is already visible which is + * all we need. No need to lock @parent. For more info on + * synchronization, see freezer_post_create(). + */ + spin_lock_irq(&pos_f->lock); + freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + spin_unlock_irq(&pos_f->lock); + } + rcu_read_unlock(); } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, @@ -390,12 +483,4 @@ struct cgroup_subsys freezer_subsys = { .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, - - /* - * freezer subsys doesn't handle hierarchy at all. Frozen state - * should be inherited through the hierarchy - if a parent is - * frozen, all its children should be frozen. Fix it and remove - * the following. - */ - .broken_hierarchy = true, }; -- cgit v1.2.3 From 04aa530ec04f61875b99c12721162e2964e3318c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 3 Nov 2012 11:52:09 +0100 Subject: genirq: Always force thread affinity Sankara reported that the genirq core code fails to adjust the affinity of an interrupt thread in several cases: 1) On request/setup_irq() the call to setup_affinity() happens before the new action is registered, so the new thread is not notified. 2) For secondary shared interrupts nothing notifies the new thread to change its affinity. 3) Interrupts which have the IRQ_NO_BALANCE flag set are not moving the thread either. Fix this by setting the thread affinity flag right on thread creation time. This ensures that under all circumstances the thread moves to the right place. Requires a check in irq_thread_check_affinity for an existing affinity mask (CONFIG_CPU_MASK_OFFSTACK=y) Reported-and-tested-by: Sankara Muthukrishnan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1209041738200.2754@ionos Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1cbd572f6ad8..35c70c9e24d8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -732,6 +732,7 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; + bool valid = true; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -746,10 +747,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->irq_data.affinity); + /* + * This code is triggered unconditionally. Check the affinity + * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
+ */ + if (desc->irq_data.affinity) + cpumask_copy(mask, desc->irq_data.affinity); + else + valid = false; raw_spin_unlock_irq(&desc->lock); - set_cpus_allowed_ptr(current, mask); + if (valid) + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -954,6 +963,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ get_task_struct(t); new->thread = t; + /* + * Tell the thread to set its affinity. This is + * important for shared interrupt handlers as we do + * not invoke setup_affinity() for the secondary + * handlers as everything is already set up. Even for + * interrupts marked with IRQF_NO_BALANCE this is + * correct as we want the thread to move to the cpu(s) + * on which the requesting code placed the interrupt. + */ + set_bit(IRQTF_AFFINITY, &new->thread_flags); } if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { -- cgit v1.2.3 From f95a985781e9e986992351c971af7f7e46e06ed5 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 18 Oct 2012 11:34:41 +0200 Subject: time/jiffies: Make clocksource_jiffies static Commit f1b8274 ("clocksource: Cleanup clocksource selection") removed all external references to clocksource_jiffies so there is no need to have the symbol globally visible. Fixes the following sparse warning: CHECK kernel/time/jiffies.c kernel/time/jiffies.c:61:20: warning: symbol 'clocksource_jiffies' was not declared. Should it be static? Signed-off-by: Lars-Peter Clausen Signed-off-by: John Stultz --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 6629bf7b5285..25f5b2699d37 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs) return (cycle_t) jiffies; } -struct clocksource clocksource_jiffies = { +static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, -- cgit v1.2.3 From d6ad418763888f617ac5b4849823e4cd670df1dd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 28 Feb 2012 16:50:11 -0800 Subject: time: Kill xtime_lock, replacing it with jiffies_lock Now that timekeeping is protected by its own locks, rename the xtime_lock to jiffies_lock to better describe what it protects.
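For illustration only, the read-side pattern that the renamed jiffies_lock protects (a sketch mirroring the get_jiffies_64() hunk in the diff below, not an addition to the patch) looks like this:

	#include <linux/seqlock.h>
	#include <linux/jiffies.h>

	/* Sketch: sample the 64-bit jiffies value under jiffies_lock. */
	static u64 sample_jiffies_64(void)
	{
		unsigned long seq;
		u64 ret;

		do {
			seq = read_seqbegin(&jiffies_lock);	/* snapshot the sequence count */
			ret = jiffies_64;
		} while (read_seqretry(&jiffies_lock, seq));	/* retry if a writer ran */

		return ret;
	}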
CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/jiffies.c | 6 ++++-- kernel/time/tick-common.c | 8 ++++---- kernel/time/tick-internal.h | 1 - kernel/time/tick-sched.c | 22 +++++++++++----------- kernel/time/timekeeping.c | 14 +++----------- 5 files changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 25f5b2699d37..7a925ba456fb 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -67,6 +67,8 @@ static struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { @@ -74,9 +76,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da6c9ecad4e4..b1600a6973f4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -63,13 +63,13 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } update_process_times(user_mode(get_irq_regs())); @@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); next = tick_next_period; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e265b901fed..cf3e59ed6dc0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) #endif extern void do_timer(unsigned long ticks); -extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..a678046c3e5e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* - * The time, when the last jiffy update happened. Protected by xtime_lock. + * The time, when the last jiffy update happened. Protected by jiffies_lock. 
*/ static ktime_t last_jiffies_update; @@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta; /* - * Do a quick check without holding xtime_lock: + * Do a quick check without holding jiffies_lock: */ delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with xtime_lock held */ - write_seqlock(&xtime_lock); + /* Reevalute with jiffies_lock held */ + write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); if (delta.tv64 >= tick_period.tv64) { @@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } /* @@ -89,12 +89,12 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? */ if (last_jiffies_update.tv64 == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); return period; } @@ -282,11 +282,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&jiffies_lock); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { @@ -658,7 +658,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * concurrency: This happens only when the cpu in charge went * into a long sleep. If two cpus happen to assign themself to * this duty, then the jiffies update is still serialized by - * xtime_lock. + * jiffies_lock. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; @@ -810,7 +810,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * concurrency: This happens only when the cpu in charge went * into a long sleep. If two cpus happen to assign themself to * this duty, then the jiffies update is still serialized by - * xtime_lock. + * jiffies_lock. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970bb562..4c7de02eacdc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,12 +25,6 @@ static struct timekeeper timekeeper; -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -1299,9 +1293,7 @@ struct timespec get_monotonic_coarse(void) } /* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... 
+ * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { @@ -1389,7 +1381,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); */ void xtime_update(unsigned long ticks) { - write_seqlock(&xtime_lock); + write_seqlock(&jiffies_lock); do_timer(ticks); - write_sequnlock(&xtime_lock); + write_sequnlock(&jiffies_lock); } -- cgit v1.2.3 From 8cbd9cc6254065c97c4bac42daa55ba1abe73a8e Mon Sep 17 00:00:00 2001 From: David Sharp Date: Tue, 13 Nov 2012 12:18:21 -0800 Subject: tracing,x86: Add a TSC trace_clock In order to promote interoperability between userspace tracers and ftrace, add a trace_clock that reports raw TSC values which will then be recorded in the ring buffer. Userspace tracers that also record TSCs are then on exactly the same time base as the kernel and events can be unambiguously interlaced. Tested: Enabled a tracepoint and the "tsc" trace_clock and saw very large timestamp values. v2: Move arch-specific bits out of generic code. v3: Rename "x86-tsc", cleanups v7: Generic arch bits in Kbuild. Google-Bug-Id: 6980623 Link: http://lkml.kernel.org/r/1352837903-32191-1-git-send-email-dhsharp@google.com Acked-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c1434b5ce4d1..0d20620c0d27 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -488,6 +488,7 @@ static struct { { trace_clock_local, "local" }, { trace_clock_global, "global" }, { trace_clock_counter, "counter" }, + ARCH_TRACE_CLOCKS }; int trace_clock_id; -- cgit v1.2.3 From 8be0709f10e3dd5d7d07933ad61a9f18c4b93ca5 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Tue, 13 Nov 2012 12:18:22 -0800 Subject: tracing: Format non-nanosec times from tsc clock without a decimal point. With the addition of the "tsc" clock, formatting timestamps to look like fractional seconds is misleading. Mark clocks as either in nanoseconds or not, and format non-nanosecond timestamps as decimal integers. Tested: $ cd /sys/kernel/debug/tracing/ $ cat trace_clock [local] global tsc $ echo sched_switch > set_event $ echo 1 > tracing_on ; sleep 0.0005 ; echo 0 > tracing_on $ cat trace -0 [000] 6330.555552: sched_switch: prev_comm=swapper prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=29964 next_prio=120 sleep-29964 [000] 6330.555628: sched_switch: prev_comm=bash prev_pid=29964 prev_prio=120 prev_state=S ==> next_comm=swapper next_pid=0 next_prio=120 ... $ echo 1 > options/latency-format $ cat trace -0 0 4104553247us+: sched_switch: prev_comm=swapper prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=29964 next_prio=120 sleep-29964 0 4104553322us+: sched_switch: prev_comm=bash prev_pid=29964 prev_prio=120 prev_state=S ==> next_comm=swapper next_pid=0 next_prio=120 ... $ echo tsc > trace_clock $ cat trace $ echo 1 > tracing_on ; sleep 0.0005 ; echo 0 > tracing_on $ echo 0 > options/latency-format $ cat trace -0 [000] 16490053398357: sched_switch: prev_comm=swapper prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=31128 next_prio=120 sleep-31128 [000] 16490053588518: sched_switch: prev_comm=bash prev_pid=31128 prev_prio=120 prev_state=S ==> next_comm=swapper next_pid=0 next_prio=120 ... 
echo 1 > options/latency-format $ cat trace -0 0 91557653238+: sched_switch: prev_comm=swapper prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=31128 next_prio=120 sleep-31128 0 91557843399+: sched_switch: prev_comm=bash prev_pid=31128 prev_prio=120 prev_state=S ==> next_comm=swapper next_pid=0 next_prio=120 ... v2: Move arch-specific bits out of generic code. v4: Fix x86_32 build due to 64-bit division. Google-Bug-Id: 6980623 Link: http://lkml.kernel.org/r/1352837903-32191-2-git-send-email-dhsharp@google.com Cc: Masami Hiramatsu Signed-off-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 +++++++-- kernel/trace/trace.h | 4 --- kernel/trace/trace_output.c | 78 ++++++++++++++++++++++++++++++--------------- 3 files changed, 65 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0d20620c0d27..d943e69569cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -484,10 +484,11 @@ static const char *trace_options[] = { static struct { u64 (*func)(void); const char *name; + int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { - { trace_clock_local, "local" }, - { trace_clock_global, "global" }, - { trace_clock_counter, "counter" }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, ARCH_TRACE_CLOCKS }; @@ -2478,6 +2479,10 @@ __tracing_open(struct inode *inode, struct file *file) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[trace_clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + /* stop the trace while dumping */ tracing_stop(); @@ -3339,6 +3344,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[trace_clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55010ed175f0..c75d7988902c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -406,10 +406,6 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; extern cpumask_var_t __read_mostly tracing_buffer_mask; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 123b189c732c..194d79602dc7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) return trace_print_lat_fmt(s, entry); } -static unsigned long preempt_mark_thresh = 100; +static unsigned long preempt_mark_thresh_us = 100; static int -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) +lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { - return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, - rel_usecs > preempt_mark_thresh ? '!' : - rel_usecs > 1 ? 
'+' : ' '); + unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; + unsigned long long abs_ts = iter->ts - iter->tr->time_start; + unsigned long long rel_ts = next_ts - iter->ts; + struct trace_seq *s = &iter->seq; + + if (in_ns) { + abs_ts = ns2usecs(abs_ts); + rel_ts = ns2usecs(rel_ts); + } + + if (verbose && in_ns) { + unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); + unsigned long abs_msec = (unsigned long)abs_ts; + unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); + unsigned long rel_msec = (unsigned long)rel_ts; + + return trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + } else if (verbose && !in_ns) { + return trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + } else if (!verbose && in_ns) { + return trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + rel_ts > preempt_mark_thresh_us ? '!' : + rel_ts > 1 ? '+' : ' '); + } else { /* !verbose && !in_ns */ + return trace_seq_printf(s, " %4lld: ", abs_ts); + } } int trace_print_context(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned long secs = (unsigned long)t; + unsigned long long t; + unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; int ret; @@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter) return 0; } - return trace_seq_printf(s, " %5lu.%06lu: ", - secs, usec_rem); + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(iter->ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + } else + return trace_seq_printf(s, " %12llu: ", iter->ts); } int trace_print_lat_context(struct trace_iterator *iter) @@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter) *next_entry = trace_find_next_entry(iter, NULL, &next_ts); unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); - unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - unsigned long rel_usecs; /* Restore the original ent_size */ iter->ent_size = ent_size; if (!next_entry) next_ts = iter->ts; - rel_usecs = ns2usecs(next_ts - iter->ts); if (verbose) { char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" - " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx, - ns2usecs(iter->ts), - abs_usecs / USEC_PER_MSEC, - abs_usecs % USEC_PER_MSEC, - rel_usecs / USEC_PER_MSEC, - rel_usecs % USEC_PER_MSEC); + ret = trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); } else { ret = lat_print_generic(s, entry, iter->cpu); - if (ret) - ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } + if (ret) + ret = lat_print_timestamp(iter, next_ts); + return ret; } -- cgit v1.2.3 From 11043d8b125671a32253cddb0b05177be0e976f6 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 13 Nov 2012 12:18:23 -0800 Subject: tracing: Show raw time stamp on stats per cpu using counter or tsc mode for trace_clock Show raw time stamp values for stats per cpu if you choose counter or tsc mode for trace_clock. 
Although a unit of tracing time stamp is nsec in local or global mode, the units in counter and TSC mode are tracing counter and cycles respectively. Link: http://lkml.kernel.org/r/1352837903-32191-3-git-send-email-dhsharp@google.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d943e69569cd..b69cc380322d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4388,13 +4388,24 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + if (trace_clocks[trace_clock_id].in_ns) { + /* local or global for trace_clock */ + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", + t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + } else { + /* counter or tsc mode for trace_clock */ + trace_seq_printf(s, "oldest event ts: %llu\n", + ring_buffer_oldest_event_ts(tr->buffer, cpu)); - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + trace_seq_printf(s, "now ts: %llu\n", + ring_buffer_time_stamp(tr->buffer, cpu)); + } cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); -- cgit v1.2.3 From 878d7439d0f45a95869e417576774673d1fa243f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Oct 2012 04:55:36 -0700 Subject: rcu: Fix batch-limit size problem Commit 29c00b4a1d9e27 (rcu: Add event-tracing for RCU callback invocation) added a regression in rcu_do_batch() Under stress, RCU is supposed to allow to process all items in queue, instead of a batch of 10 items (blimit), but an integer overflow makes the effective limit being 1. So, unless there is frequent idle periods (during which RCU ignores batch limits), RCU can be forced into a state where it cannot keep up with the callback-generation rate, eventually resulting in OOM. This commit therefore converts a few variables in rcu_do_batch() from int to long to fix this problem, along with the module parameters controlling the batch limits. Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney Cc: # 3.2 + --- kernel/rcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 93d6871bf7f9..e4c2192b47c8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -212,13 +212,13 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif }; -static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static int qhimark = 10000; /* If this many pending, ignore blimit. */ -static int qlowmark = 100; /* Once only this many pending, use blimit. */ +static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ +static long qhimark = 10000; /* If this many pending, ignore blimit. 
*/ +static long qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0444); -module_param(qhimark, int, 0444); -module_param(qlowmark, int, 0444); +module_param(blimit, long, 0444); +module_param(qhimark, long, 0444); +module_param(qlowmark, long, 0444); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; @@ -1791,7 +1791,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy, i; + long bl, count, count_lazy; + int i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { -- cgit v1.2.3 From 67afeed2cab0e59712b4ebf1aef9a2e555a188ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Oct 2012 12:56:06 -0700 Subject: rcu: Add new rcutorture module parameters to start/end test messages Several new rcutorture module parameters have been added, but are not printed to the console at the beginning and end of tests, which makes it difficult to reproduce a prior test. This commit therefore adds these new module parameters to the list printed at the beginning and the end of the tests. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index aaa7b9f3532a..7fa184f6a48f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1396,12 +1396,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " + "stall_cpu=%d stall_cpu_holdoff=%d " + "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, + stall_cpu, stall_cpu_holdoff, + n_barrier_cbs, onoff_interval, onoff_holdoff); } -- cgit v1.2.3 From f0a0e6f282c72247e7c8ec17c68d528c1bb4d49e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Oct 2012 13:47:01 -0700 Subject: rcu: Clarify memory-ordering properties of grace-period primitives This commit explicitly states the memory-ordering properties of the RCU grace-period primitives. Although these properties were in some sense implied by the fundmental property of RCU ("a grace period must wait for all pre-existing RCU read-side critical sections to complete"), stating it explicitly will be a great labor-saving device. Reported-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Oleg Nesterov --- kernel/rcutree.c | 29 +++++++++++++++++++++++++---- kernel/rcutree_plugin.h | 8 ++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e4c2192b47c8..15a2beec320f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2228,10 +2228,28 @@ static inline int rcu_blocking_is_gp(void) * rcu_read_lock_sched(). * * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. 
However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. + * non-threaded hardware-interrupt handlers, in progress on entry will + * have completed before this primitive returns. However, this does not + * guarantee that softirq handlers will have completed, since in some + * kernels, these handlers can run in process context, and can block. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_sched() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-sched read-side critical section whose beginning + * preceded the call to synchronize_sched(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_sched() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_sched() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_sched(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). * * This primitive provides the guarantees made by the (now removed) * synchronize_kernel() API. In contrast, synchronize_rcu() only @@ -2259,6 +2277,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), * and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. */ void synchronize_rcu_bh(void) { diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f92115488187..57e0ef8ed721 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -670,6 +670,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); * concurrently with new RCU read-side critical sections that began while * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. */ void synchronize_rcu(void) { @@ -875,6 +878,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + * + * Note that this primitive does not necessarily wait for an RCU grace period + * to complete. For example, if there are no RCU callbacks queued anywhere + * in the system, then rcu_barrier() is within its rights to return + * immediately, without waiting for anything, much less an RCU grace period. */ void rcu_barrier(void) { -- cgit v1.2.3 From 351573a86d0ef17cbba1c5436706602692781bfe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Oct 2012 04:52:56 -0700 Subject: rcu: Fix TINY_RCU rcu_is_cpu_rrupt_from_idle check The rcu_is_cpu_rrupt_from_idle() needs to allow for one interrupt level from the idle loop, but TINY_RCU checks for a call from the idle loop itself. This commit fixes this issue. 
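As a hedged illustration of the off-by-one being fixed (assumed counter semantics, not a quote of the rcutiny source): the idle loop itself runs with the dynticks nesting count at or below zero, and a single interrupt taken from idle raises it by one, so a check for "interrupt taken from idle" must accept a nesting level of one rather than zero:

	/* Sketch: why the threshold moves from 0 to 1. */
	static int irq_from_idle_example(long dynticks_nesting)
	{
		/*
		 * "<= 0" only matches the idle loop itself;
		 * "<= 1" also matches one interrupt level above it.
		 */
		return dynticks_nesting <= 1;
	}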
Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e4c6a598d6f7..e7dce58f9c2a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -195,7 +195,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); */ int rcu_is_cpu_rrupt_from_idle(void) { - return rcu_dynticks_nesting <= 0; + return rcu_dynticks_nesting <= 1; } /* -- cgit v1.2.3 From 65b6ecc03838fd263cf7fafdfa6cf13012b91d56 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 14 Nov 2012 18:27:07 +0100 Subject: uprobes: Flush cache after xol write Flush the cache so that the instructions written to the XOL area are visible. Signed-off-by: Rabin Vincent Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 39c75cc51efc..5ce99cfd2e6e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1199,6 +1199,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot vaddr = kmap_atomic(area->page); memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); kunmap_atomic(vaddr); + /* + * We probably need flush_icache_user_range() but it needs vma. + * This should work on supported architectures too. + */ + flush_dcache_page(area->page); return current->utask->xol_vaddr; } -- cgit v1.2.3 From 6e32d479db6079dd5d4309aa66aecbcf2664a5fe Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:43 -0800 Subject: kernel/cpu.c: Add comment for priority in cpu_hotplug_pm_callback cpu_hotplug_pm_callback should have higher priority than bsp_pm_callback which depends on cpu_hotplug_pm_callback to disable cpu hotplug to avoid race during bsp online checking. This is to highlight the priorities between the two callbacks in case people overlook the order. Ideally the priorities should be defined in macro/enum instead of fixed values. To do that, a separate patchset may be pushed which will touch several other generic files and is out of scope of this patchset. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-7-git-send-email-fenghua.yu@intel.com Reviewed-by: Srivatsa S. Bhat Acked-by: Rafael J. Wysocki Signed-off-by: H. Peter Anvin --- kernel/cpu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 42bd331ee0ab..a2491a2d62bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -601,6 +601,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, static int __init cpu_hotplug_pm_sync_init(void) { + /* + * cpu_hotplug_pm_callback has higher priority than x86 + * bsp_pm_callback which depends on cpu_hotplug_pm_callback + * to disable cpu hotplug to avoid cpu hotplug race. + */ pm_notifier(cpu_hotplug_pm_callback, 0); return 0; } -- cgit v1.2.3 From 5e5041f3527b36b58e864886ba34c179ad40ff92 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 23 Oct 2012 01:30:54 +0200 Subject: ACPI / processor: prevent cpu from becoming online Even if acpi_processor_handle_eject() offlines a cpu, there is a chance to online the cpu after that. So the patch closes the window by using get/put_online_cpus(). Why does the patch change _cpu_up() logic? The patch handles the race between hot-removing a cpu and _cpu_up(). If the patch does not change it, there is the following race.
hot-remove cpu                      | _cpu_up()
------------------------------------- ------------------------------------
call acpi_processor_handle_eject()  |
     call cpu_down()                |
     call get_online_cpus()         |
                                    | call cpu_hotplug_begin() and stop here
     call arch_unregister_cpu()     |
     call acpi_unmap_lsapic()       |
     call put_online_cpus()         |
                                    | start and continue _cpu_up()
return acpi_processor_remove()      |
continue hot-remove the cpu         |

So _cpu_up() can continue, and the cpu hot-remove can also continue. If the patch changes the _cpu_up() logic, the race disappears as below:

hot-remove cpu                      | _cpu_up()
-----------------------------------------------------------------------
call acpi_processor_handle_eject()  |
     call cpu_down()                |
     call get_online_cpus()         |
                                    | call cpu_hotplug_begin() and stop here
     call arch_unregister_cpu()     |
     call acpi_unmap_lsapic()       |
     cpu's cpu_present is set       |
     to false by set_cpu_present()  |
     call put_online_cpus()         |
                                    | start _cpu_up()
                                    | check cpu_present() and return -EINVAL
return acpi_processor_remove()      |
continue hot-remove the cpu         |

Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Srivatsa S. Bhat Reviewed-by: Toshi Kani Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 42bd331ee0ab..f45657f1eb8e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -348,11 +348,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct task_struct *idle; - if (cpu_online(cpu) || !cpu_present(cpu)) - return -EINVAL; - cpu_hotplug_begin(); + if (cpu_online(cpu) || !cpu_present(cpu)) { + ret = -EINVAL; + goto out; + } + idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); -- cgit v1.2.3 From 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Fri, 26 Oct 2012 12:26:41 +0200 Subject: cpuidle: Quickly notice prediction failure for repeat mode Predicting the future is difficult, and when the cpuidle governor's prediction fails, the governor may choose a shallower C-state than it should. Quickly noticing and identifying such a failure is important for power saving. The cpuidle menu governor predicts a repeat pattern when the last 8 C-state residencies are continuous and the same or very close, in which case it assumes the next residency will be about the same. A real case is the turbostat utility (tools/power/x86/turbostat) in kernel 3.3 or earlier: turbostat reads 10 registers one by one on Sandybridge, generating 10 IPIs that wake up idle CPUs. The menu governor therefore predicts repeat mode and expects another IPI to arrive soon, so it keeps the idle CPU in C1 even though the CPU is totally idle. However, turbostat then sleeps for 5 seconds by default before the next batch of register reads, so the idle CPU stays in C1 for a long time until a break event occurs. On an idle Sandybridge system running "./turbostat -v", deep C-state residency dangles between 70% and 99%; after the kernel is patched, it stays above 99.98%. In the patch, a timer is added when the menu governor detects a repeat mode and chooses a shallow C-state. The timer is set to a timeout value greater than the predicted time, and we conclude that the repeat-mode prediction failed if the timer is triggered (a sketch of this guard-timer idea follows below).
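Below is a minimal sketch of that guard-timer idea, with hypothetical names rather than the menu governor's actual code; it assumes an hrtimer armed a bit beyond the predicted interval, whose expiry marks the prediction as failed:

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer repeat_guard_timer;	/* hypothetical */
	static bool repeat_prediction_failed;		/* hypothetical */

	static enum hrtimer_restart repeat_guard_expired(struct hrtimer *t)
	{
		/* The expected wakeup never came: the repeat prediction was wrong. */
		repeat_prediction_failed = true;
		return HRTIMER_NORESTART;
	}

	static void repeat_guard_init(void)
	{
		hrtimer_init(&repeat_guard_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		repeat_guard_timer.function = repeat_guard_expired;
	}

	static void repeat_guard_arm(u64 predicted_us)
	{
		/* Allow some slack beyond the prediction before declaring failure. */
		u64 timeout_us = predicted_us + predicted_us / 2;

		hrtimer_start(&repeat_guard_timer,
			      ns_to_ktime(timeout_us * NSEC_PER_USEC),
			      HRTIMER_MODE_REL);
	}

	static void repeat_guard_cancel(void)
	{
		/* A wakeup arrived as predicted: no failure, drop the guard. */
		hrtimer_cancel(&repeat_guard_timer);
	}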
When repeat mode happens as expected, the timer is not triggered and CPU waken up from C-states and it will cancel the timer initiatively. When repeat mode does not happen, the timer will be time out and menu governor will quickly notice that the repeat mode prediction fails and then re-evaluates deeper C-states possibility. Below is another case which will clearly show the patch much benefit: #include #include #include #include #include #include #include volatile int * shutdown; volatile long * count; int delay = 20; int loop = 8; void usage(void) { fprintf(stderr, "Usage: idle_predict [options]\n" " --help -h Print this help\n" " --thread -n Thread number\n" " --loop -l Loop times in shallow Cstate\n" " --delay -t Sleep time (uS)in shallow Cstate\n"); } void *simple_loop() { int idle_num = 1; while (!(*shutdown)) { *count = *count + 1; if (idle_num % loop) usleep(delay); else { /* sleep 1 second */ usleep(1000000); idle_num = 0; } idle_num++; } } static void sighand(int sig) { *shutdown = 1; } int main(int argc, char *argv[]) { sigset_t sigset; int signum = SIGALRM; int i, c, er = 0, thread_num = 8; pthread_t pt[1024]; static char optstr[] = "n:l:t:h:"; while ((c = getopt(argc, argv, optstr)) != EOF) switch (c) { case 'n': thread_num = atoi(optarg); break; case 'l': loop = atoi(optarg); break; case 't': delay = atoi(optarg); break; case 'h': default: usage(); exit(1); } printf("thread=%d,loop=%d,delay=%d\n",thread_num,loop,delay); count = malloc(sizeof(long)); shutdown = malloc(sizeof(int)); *count = 0; *shutdown = 0; sigemptyset(&sigset); sigaddset(&sigset, signum); sigprocmask (SIG_BLOCK, &sigset, NULL); signal(SIGINT, sighand); signal(SIGTERM, sighand); for(i = 0; i < thread_num ; i++) pthread_create(&pt[i], NULL, simple_loop, NULL); for (i = 0; i < thread_num; i++) pthread_join(pt[i], NULL); exit(0); } Get powertop V2 from git://github.com/fenrus75/powertop, build powertop. After build the above test application, then run it. Test plaform can be Intel Sandybridge or other recent platforms. #./idle_predict -l 10 & #./powertop We will find that deep C-state will dangle between 40%~100% and much time spent on C1 state. It is because menu governor wrongly predict that repeat mode is kept, so it will choose the C1 shallow C-state even though it has chance to sleep 1 second in deep C-state. While after patched the kernel, we find that deep C-state will keep >99.6%. Signed-off-by: Rik van Riel Signed-off-by: Youquan Song Signed-off-by: Rafael J. Wysocki --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a40260885265..6f337068dc4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -526,6 +526,8 @@ void tick_nohz_irq_exit(void) if (!ts->inidle) return; + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); __tick_nohz_idle_enter(ts); } @@ -621,6 +623,8 @@ void tick_nohz_idle_exit(void) ts->inidle = 0; + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); if (ts->idle_active || ts->tick_stopped) now = ktime_get(); -- cgit v1.2.3 From 883ee4f79d636d13f24c2479c66d42cb652b0239 Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Tue, 23 Oct 2012 01:20:35 +0200 Subject: PM / sysfs: replace strict_str* with kstrto* Replace strict_strtoul() with kstrtoul() in pm_async_store() and pm_qos_power_write(). [rjw: Modified subject and changelog.] 
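For reference, a minimal usage sketch of the kstrtoul() conversion adopted above (illustration only, with a hypothetical helper name, not part of the diff): kstrtoul() returns 0 on success or a negative errno, so callers can propagate the error directly:

	#include <linux/kernel.h>
	#include <linux/errno.h>

	/* Parse a "0"/"1" attribute string, as pm_async_store() does. */
	static int parse_bool_attr(const char *buf, bool *res)
	{
		unsigned long val;
		int err = kstrtoul(buf, 10, &val);	/* base 10 conversion */

		if (err)
			return err;
		if (val > 1)
			return -EINVAL;
		*res = val;
		return 0;
	}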
Signed-off-by: Daniel Walter Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- kernel/power/qos.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index f458238109cc..1c16f9167de1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, { unsigned long val; - if (strict_strtoul(buf, 10, &val)) + if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 846bd42c7ed1..4da05cee81bf 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -500,7 +500,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else { ascii_value[count] = '\0'; } - ret = strict_strtoul(ascii_value, 16, &ulval); + ret = kstrtoul(ascii_value, 16, &ulval); if (ret) { pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); return -EINVAL; -- cgit v1.2.3 From 8316bd72c0248adbb9572abf2dd045a95f682bcd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 23 Oct 2012 01:21:09 +0200 Subject: PM / Hibernate: use rb_entry Since the software suspend extents are organized in an rbtree, use rb_entry instead of container_of, as it is semantically more appropriate in order to get a node as it is iterated. Signed-off-by: Davidlohr Bueso Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3c9d764eb0d8..7c33ed200410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) /* Figure out where to put the new node */ while (*new) { - ext = container_of(*new, struct swsusp_extent, node); + ext = rb_entry(*new, struct swsusp_extent, node); parent = *new; if (swap_offset < ext->start) { /* Try to merge */ -- cgit v1.2.3 From 70f77b3f7ec010ff9624c1f2e39a81babc9e2429 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Jun 2012 19:10:27 +0300 Subject: ftrace: Clear bits properly in reset_iter_read() There is a typo here where '&' is used instead of '|' and it turns the statement into a noop. The original code is equivalent to: iter->flags &= ~((1 << 2) & (1 << 4)); Link: http://lkml.kernel.org/r/20120609161027.GD6488@elgon.mountain Cc: stable@vger.kernel.org # all of them Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9dcf15d38380..51b71594f321 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2437,7 +2437,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) -- cgit v1.2.3 From d60da506cbeb3f1907a740547dd7ef04a93e908e Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 17 Oct 2012 11:56:16 +0900 Subject: tracing: Add a resize function to make one buffer equivalent to another buffer Trace buffer size is now per-cpu, so that there are the following two patterns in resizing of buffers. 
(1) resize per-cpu buffers to same given size (2) resize per-cpu buffers to another trace_array's buffer size for each CPU (such as preparing the max_tr which is equivalent to the global_trace's size) __tracing_resize_ring_buffer() can be used for (1), and had implemented (2) inside it for resetting the global_trace to the original size. (2) was also implemented in another place. So this patch assembles them in a new function - resize_buffer_duplicate_size(). Link: http://lkml.kernel.org/r/20121017025616.2627.91226.stgit@falsita Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 58 ++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b69cc380322d..64ad9bc4275b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3034,6 +3034,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) tr->data[cpu]->entries = val; } +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_array *tr, + struct trace_array *size_tr, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu]->entries, cpu); + if (ret < 0) + break; + tr->data[cpu]->entries = size_tr->data[cpu]->entries; + } + } else { + ret = ring_buffer_resize(tr->buffer, + size_tr->data[cpu_id]->entries, cpu_id); + if (ret == 0) + tr->data[cpu_id]->entries = + size_tr->data[cpu_id]->entries; + } + + return ret; +} + static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ -3058,23 +3083,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r = 0; - - if (cpu == RING_BUFFER_ALL_CPUS) { - int i; - for_each_tracing_cpu(i) { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[i]->entries, - i); - if (r < 0) - break; - } - } else { - r = ring_buffer_resize(global_trace.buffer, - global_trace.data[cpu]->entries, - cpu); - } - + int r = resize_buffer_duplicate_size(&global_trace, + &global_trace, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3212,17 +3222,11 @@ static int tracing_set_tracer(const char *buf) topts = create_trace_option_files(t); if (t->use_max_tr) { - int cpu; /* we need to make per cpu buffer sizes equivalent */ - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(max_tr.buffer, - global_trace.data[cpu]->entries, - cpu); - if (ret < 0) - goto out; - max_tr.data[cpu]->entries = - global_trace.data[cpu]->entries; - } + ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; } if (t->init) { -- cgit v1.2.3 From 32cdba1e05418909708a17e52505e8b2ba4381d1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 14 Nov 2012 19:03:42 +0100 Subject: uprobes: Use percpu_rw_semaphore to fix register/unregister vs dup_mmap() race This was always racy, but 268720903f87e0b84b161626c4447b81671b5d18 "uprobes: Rework register_for_each_vma() to make it O(n)" should be blamed anyway, it made everything worse and I didn't notice. register/unregister call build_map_info() and then do install/remove breakpoint for every mm which mmaps inode/offset. This can obviously race with fork()->dup_mmap() in between and we can miss the child. 
uprobe_register() could be easily fixed but unregister is much worse, the new mm inherits "int3" from parent and there is no way to detect this if uprobe goes away. So this patch simply adds percpu_down_read/up_read around dup_mmap(), and percpu_down_write/up_write into register_for_each_vma(). This adds 2 new hooks into dup_mmap() but we can kill uprobe_dup_mmap() and fold it into uprobe_end_dup_mmap(). Reported-by: Srikar Dronamraju Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 26 +++++++++++++++++++++++--- kernel/fork.c | 2 ++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5ce99cfd2e6e..dea7acfbb071 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -33,6 +33,7 @@ #include /* user_enable_single_step */ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ +#include #include @@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) +static struct percpu_rw_semaphore dup_mmap_sem; + /* * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe * events active at this time. Probably a fine grained per inode count is @@ -766,10 +769,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) struct map_info *info; int err = 0; + percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); - if (IS_ERR(info)) - return PTR_ERR(info); + if (IS_ERR(info)) { + err = PTR_ERR(info); + goto out; + } while (info) { struct mm_struct *mm = info->mm; @@ -799,7 +805,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) mmput(mm); info = free_map_info(info); } - + out: + percpu_up_write(&dup_mmap_sem); return err; } @@ -1131,6 +1138,16 @@ void uprobe_clear_state(struct mm_struct *mm) kfree(area); } +void uprobe_start_dup_mmap(void) +{ + percpu_down_read(&dup_mmap_sem); +} + +void uprobe_end_dup_mmap(void) +{ + percpu_up_read(&dup_mmap_sem); +} + void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { newmm->uprobes_state.xol_area = NULL; @@ -1597,6 +1614,9 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mmap_mutex[i]); } + if (percpu_init_rwsem(&dup_mmap_sem)) + return -ENOMEM; + return register_die_notifier(&uprobe_exception_nb); } module_init(init_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..c497e57aa654 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; struct mempolicy *pol; + uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); @@ -469,6 +470,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: mpol_put(pol); -- cgit v1.2.3 From 3fbfbf7a3b66ec424042d909f14ba2ddf4372ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Aug 2012 21:35:53 -0700 Subject: rcu: Add callback-free CPUs RCU callback execution can add significant OS jitter and also can degrade both scheduling latency and, in asymmetric multiprocessors, energy efficiency. 
This commit therefore adds the ability for selected CPUs ("rcu_nocbs=" boot parameter) to have their callbacks offloaded to kthreads. If the "rcu_nocb_poll" boot parameter is also specified, these kthreads will do polling, removing the need for the offloaded CPUs to do wakeups. At least one CPU must be doing normal callback processing: currently CPU 0 cannot be selected as a no-CBs CPU. In addition, attempts to offline the last normal-CBs CPU will fail. This feature was inspired by Jim Houston's and Joe Korty's JRCU, and this commit includes fixes to problems located by Fengguang Wu's kbuild test robot. [ paulmck: Added gfp.h include file as suggested by Fengguang Wu. ] Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 63 ++++++-- kernel/rcutree.h | 47 ++++++ kernel/rcutree_plugin.h | 397 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/rcutree_trace.c | 7 +- 4 files changed, 498 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5ffadcc3bb26..7733eb56e156 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -303,7 +303,8 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); static int cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) { - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && + rdp->nxttail[RCU_DONE_TAIL] != NULL; } /* @@ -312,8 +313,11 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL + - (ACCESS_ONCE(rsp->completed) != rdp->completed)] && + struct rcu_head **ntp; + + ntp = rdp->nxttail[RCU_DONE_TAIL + + (ACCESS_ONCE(rsp->completed) != rdp->completed)]; + return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && !rcu_gp_in_progress(rsp); } @@ -1123,6 +1127,7 @@ static void init_callback_list(struct rcu_data *rdp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + init_nocb_callback_list(rdp); } /* @@ -1633,6 +1638,10 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + /* No-CBs CPUs do not have orphanable callbacks. */ + if (is_nocb_cpu(rdp->cpu)) + return; + /* * Orphan the callbacks. First adjust the counts. This is safe * because _rcu_barrier() excludes CPU-hotplug operations, so it @@ -1684,6 +1693,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + /* No-CBs CPUs are handled specially. */ + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + return; + /* Do the accounting first. */ rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; @@ -2162,9 +2175,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, } } +/* + * Helper function for call_rcu() and friends. The cpu argument will + * normally be -1, indicating "currently running CPU". It may specify + * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() + * is expected to specify a CPU. + */ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp, bool lazy) + struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -2184,9 +2203,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. 
*/ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + int offline; + + if (cpu != -1) + rdp = per_cpu_ptr(rsp->rda, cpu); + offline = !__call_rcu_nocb(rdp, head, lazy); + WARN_ON_ONCE(offline); /* _call_rcu() is illegal on offline CPU; leak the callback. */ - WARN_ON_ONCE(1); local_irq_restore(flags); return; } @@ -2215,7 +2239,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state, 0); + __call_rcu(head, func, &rcu_sched_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); @@ -2224,7 +2248,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state, 0); + __call_rcu(head, func, &rcu_bh_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -2676,9 +2700,17 @@ static void _rcu_barrier(struct rcu_state *rsp) * When that callback is invoked, we will know that all of the * corresponding CPU's preceding callbacks have been invoked. */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { + if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (ACCESS_ONCE(rdp->qlen)) { + if (is_nocb_cpu(cpu)) { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, rcu_barrier_callback, + rsp, cpu, 0); + } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -2752,6 +2784,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) #endif rdp->cpu = cpu; rdp->rsp = rsp; + rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2833,6 +2866,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; + int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2846,7 +2880,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - rcu_boost_kthread_setaffinity(rnp, cpu); + if (nocb_cpu_expendable(cpu)) + rcu_boost_kthread_setaffinity(rnp, cpu); + else + ret = NOTIFY_BAD; break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -2870,7 +2907,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return NOTIFY_OK; + return ret; } /* @@ -2890,6 +2927,7 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_spawn_nocb_kthreads(rsp); } return 0; } @@ -3085,6 +3123,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); + rcu_init_nocb(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d274af357210..488f2ec6b663 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -317,6 +317,18 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + /* 7) Callback offloading. */ +#ifdef CONFIG_RCU_NOCB_CPU + struct rcu_head *nocb_head; /* CBs waiting for kthread. 
*/ + struct rcu_head **nocb_tail; + atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ + atomic_long_t nocb_q_count_lazy; /* (approximate). */ + int nocb_p_count; /* # CBs being invoked by kthread */ + int nocb_p_count_lazy; /* (approximate). */ + wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_kthread; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int cpu; struct rcu_state *rsp; }; @@ -369,6 +381,12 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); +#ifdef CONFIG_RCU_NOCB_CPU + void (*call_remote)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + /* call_rcu() flavor, but for */ + /* placing on remote CPU. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -439,6 +457,8 @@ struct rcu_state { #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ extern struct list_head rcu_struct_flavors; + +/* Sequence through rcu_state structures for each RCU flavor. */ #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) @@ -515,5 +535,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool is_nocb_cpu(int cpu); +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy); +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp); +static bool nocb_cpu_expendable(int cpu); +static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); +static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void init_nocb_callback_list(struct rcu_data *rdp); +static void __init rcu_init_nocb(void); #endif /* #ifndef RCU_TREE_NONCORE */ + +#ifdef CONFIG_RCU_TRACE +#ifdef CONFIG_RCU_NOCB_CPU +/* Sum up queue lengths for tracing. */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; +} +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = 0; + *qll = 0; +} +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5ce3352505e9..6cdc372de34c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include +#include #include #include @@ -36,6 +37,14 @@ #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO #endif +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ +static bool rcu_nocb_poll; /* Offload kthread are to poll. */ +module_param(rcu_nocb_poll, bool, 0444); +static char __initdata nocb_buf[NR_CPUS * 5]; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. 
If you like #ifdef, you @@ -76,6 +85,18 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +#ifdef CONFIG_RCU_NOCB_CPU + if (have_rcu_nocb_mask) { + if (cpumask_test_cpu(0, rcu_nocb_mask)) { + cpumask_clear_cpu(0, rcu_nocb_mask); + pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); + } + cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); + pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + if (rcu_nocb_poll) + pr_info("\tExperimental polled no-CBs CPUs.\n"); + } +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } #ifdef CONFIG_TREE_PREEMPT_RCU @@ -642,7 +663,7 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, 0); + __call_rcu(head, func, &rcu_preempt_state, -1, 0); } EXPORT_SYMBOL_GPL(call_rcu); @@ -656,7 +677,7 @@ EXPORT_SYMBOL_GPL(call_rcu); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state, 1); + __call_rcu(head, func, &rcu_preempt_state, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -1025,7 +1046,7 @@ static void rcu_preempt_check_callbacks(int cpu) void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state, 1); + __call_rcu(head, func, &rcu_sched_state, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2104,3 +2125,373 @@ static void increment_cpu_stall_ticks(void) } #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +#ifdef CONFIG_RCU_NOCB_CPU + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For each CPU in the set, there is a + * kthread created that pulls the callbacks from the corresponding CPU, + * waits for a grace period to elapse, and invokes the callbacks. + * The no-CBs CPUs do a wake_up() on their kthread when they insert + * a callback into any empty list, unless the rcu_nocb_poll boot parameter + * has been specified, in which case each kthread actively polls its + * CPU. (Which isn't so great for energy efficiency, but which does + * reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callback processing could also in theory be used as + * an energy-efficiency measure because CPUs with no RCU callbacks + * queued are more aggressive about entering dyntick-idle mode. + */ + + +/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + cpulist_parse(str, rcu_nocb_mask); + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +/* Is the specified CPU a no-CPUs CPU? */ +static bool is_nocb_cpu(int cpu) +{ + if (have_rcu_nocb_mask) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +/* + * Enqueue the specified string of rcu_head structures onto the specified + * CPU's no-CBs lists. The CPU is specified by rdp, the head of the + * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy + * counts are supplied by rhcount and rhcount_lazy. 
+ * + * If warranted, also wake up the kthread servicing this CPUs queues. + */ +static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, + struct rcu_head *rhp, + struct rcu_head **rhtp, + int rhcount, int rhcount_lazy) +{ + int len; + struct rcu_head **old_rhpp; + struct task_struct *t; + + /* Enqueue the callback on the nocb list and update counts. */ + old_rhpp = xchg(&rdp->nocb_tail, rhtp); + ACCESS_ONCE(*old_rhpp) = rhp; + atomic_long_add(rhcount, &rdp->nocb_q_count); + atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + + /* If we are not being polled and there is a kthread, awaken it ... */ + t = ACCESS_ONCE(rdp->nocb_kthread); + if (rcu_nocb_poll | !t) + return; + len = atomic_long_read(&rdp->nocb_q_count); + if (old_rhpp == &rdp->nocb_head) { + wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + rdp->qlen_last_fqs_check = 0; + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + wake_up_process(t); /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = LONG_MAX / 2; + } + return; +} + +/* + * This is a helper for __call_rcu(), which invokes this when the normal + * callback queue is inoperable. If this is not a no-CBs CPU, this + * function returns failure back to __call_rcu(), which can complain + * appropriately. + * + * Otherwise, this function queues the callback where the corresponding + * "rcuo" kthread can find it. + */ +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + + if (!is_nocb_cpu(rdp->cpu)) + return 0; + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + return 1; +} + +/* + * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is + * not a no-CBs CPU. + */ +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + long ql = rsp->qlen; + long qll = rsp->qlen_lazy; + + /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + if (!is_nocb_cpu(smp_processor_id())) + return 0; + rsp->qlen = 0; + rsp->qlen_lazy = 0; + + /* First, enqueue the donelist, if any. This preserves CB ordering. */ + if (rsp->orphan_donelist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, + rsp->orphan_donetail, ql, qll); + ql = qll = 0; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + if (rsp->orphan_nxtlist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, + rsp->orphan_nxttail, ql, qll); + ql = qll = 0; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } + return 1; +} + +/* + * There must be at least one non-no-CBs CPU in operation at any given + * time, because no-CBs CPUs are not capable of initiating grace periods + * independently. This function therefore complains if the specified + * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to + * avoid offlining the last such CPU. (Recursion is a wonderful thing, + * but you have to have a base case!) + */ +static bool nocb_cpu_expendable(int cpu) +{ + cpumask_var_t non_nocb_cpus; + int ret; + + /* + * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, + * then offlining this CPU is harmless. Let it happen. + */ + if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) + return 1; + + /* If no memory, play it safe and keep the CPU around. 
*/ + if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) + return 0; + cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); + cpumask_clear_cpu(cpu, non_nocb_cpus); + ret = !cpumask_empty(non_nocb_cpus); + free_cpumask_var(non_nocb_cpus); + return ret; +} + +/* + * Helper structure for remote registry of RCU callbacks. + * This is needed for when a no-CBs CPU needs to start a grace period. + * If it just invokes call_rcu(), the resulting callback will be queued, + * which can result in deadlock. + */ +struct rcu_head_remote { + struct rcu_head *rhp; + call_rcu_func_t *crf; + void (*func)(struct rcu_head *rhp); +}; + +/* + * Register a callback as specified by the rcu_head_remote struct. + * This function is intended to be invoked via smp_call_function_single(). + */ +static void call_rcu_local(void *arg) +{ + struct rcu_head_remote *rhrp = + container_of(arg, struct rcu_head_remote, rhp); + + rhrp->crf(rhrp->rhp, rhrp->func); +} + +/* + * Set up an rcu_head_remote structure and the invoke call_rcu_local() + * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via + * smp_call_function_single(). + */ +static void invoke_crf_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp), + call_rcu_func_t crf) +{ + struct rcu_head_remote rhr; + + rhr.rhp = rhp; + rhr.crf = crf; + rhr.func = func; + smp_call_function_single(0, call_rcu_local, &rhr, 1); +} + +/* + * Helper functions to be passed to wait_rcu_gp(), each of which + * invokes invoke_crf_remote() to register a callback appropriately. + */ +static void __maybe_unused +call_rcu_preempt_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu); +} +static void call_rcu_bh_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu_bh); +} +static void call_rcu_sched_remote(struct rcu_head *rhp, + void (*func)(struct rcu_head *rhp)) +{ + invoke_crf_remote(rhp, func, call_rcu_sched); +} + +/* + * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes + * callbacks queued by the corresponding no-CBs CPU. + */ +static int rcu_nocb_kthread(void *arg) +{ + int c, cl; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_head **tail; + struct rcu_data *rdp = arg; + + /* Each pass through this loop invokes one batch of callbacks */ + for (;;) { + /* If not polling, wait for next batch of callbacks. */ + if (!rcu_nocb_poll) + wait_event(rdp->nocb_wq, rdp->nocb_head); + list = ACCESS_ONCE(rdp->nocb_head); + if (!list) { + schedule_timeout_interruptible(1); + continue; + } + + /* + * Extract queued callbacks, update counts, and wait + * for a grace period to elapse. + */ + ACCESS_ONCE(rdp->nocb_head) = NULL; + tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + c = atomic_long_xchg(&rdp->nocb_q_count, 0); + cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + ACCESS_ONCE(rdp->nocb_p_count) += c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; + wait_rcu_gp(rdp->rsp->call_remote); + + /* Each pass through the following loop invokes a callback. */ + trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + c = cl = 0; + while (list) { + next = list->next; + /* Wait for enqueuing to complete, if needed. 
*/ + while (next == NULL && &list->next != tail) { + schedule_timeout_interruptible(1); + next = list->next; + } + debug_rcu_head_unqueue(list); + local_bh_disable(); + if (__rcu_reclaim(rdp->rsp->name, list)) + cl++; + c++; + local_bh_enable(); + list = next; + } + trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); + ACCESS_ONCE(rdp->nocb_p_count) -= c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + rdp->n_cbs_invoked += c; + } + return 0; +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. */ +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ + rdp->nocb_tail = &rdp->nocb_head; + init_waitqueue_head(&rdp->nocb_wq); +} + +/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + struct task_struct *t; + + if (rcu_nocb_mask == NULL) + return; + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(rsp->rda, cpu); + t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp->nocb_kthread) = t; + } +} + +/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ +static void init_nocb_callback_list(struct rcu_data *rdp) +{ + if (rcu_nocb_mask == NULL || + !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + return; + rdp->nxttail[RCU_NEXT_TAIL] = NULL; +} + +/* Initialize the ->call_remote fields in the rcu_state structures. */ +static void __init rcu_init_nocb(void) +{ +#ifdef CONFIG_PREEMPT_RCU + rcu_preempt_state.call_remote = call_rcu_preempt_remote; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + rcu_bh_state.call_remote = call_rcu_bh_remote; + rcu_sched_state.call_remote = call_rcu_sched_remote; +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static bool is_nocb_cpu(int cpu) +{ + return false; +} + +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + return 0; +} + +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + return 0; +} + +static bool nocb_cpu_expendable(int cpu) +{ + return 1; +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ +} + +static void init_nocb_callback_list(struct rcu_data *rdp) +{ +} + +static void __init rcu_init_nocb(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index f9512687a6e5..3189f9aa3e84 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -113,6 +113,8 @@ static char convert_kthread_status(unsigned int kthread_status) static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { + long ql, qll; + if (!rdp->beenonline) return; seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", @@ -126,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, " of=%lu", rdp->offline_fqs); + rcu_nocb_q_lengths(rdp, &ql, &qll); + qll += rdp->qlen_lazy; + ql += rdp->qlen; seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - rdp->qlen_lazy, rdp->qlen, + qll, ql, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != -- cgit v1.2.3 From c635a4e1c24e9396db10ed7424b2084908b32252 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 29 Oct 2012 07:29:20 -0700 Subject: rcu: Separate accounting of callbacks from callback-free CPUs Currently, callback invocations from callback-free CPUs are accounted to the CPU that registered the callback, but using the same field that is used for normal callbacks. This makes it impossible to determine from debugfs output whether callbacks are in fact being diverted. This commit therefore adds a separate ->n_nocbs_invoked field in the rcu_data structure in which diverted callback invocations are counted. RCU's debugfs tracing still displays normal callback invocations using ci=, but displayed diverted callbacks with nci=. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 2 +- kernel/rcutree_trace.c | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 488f2ec6b663..4b69291b093d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -287,6 +287,7 @@ struct rcu_data { long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6cdc372de34c..f6e5ec2932b4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2406,7 +2406,7 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); ACCESS_ONCE(rdp->nocb_p_count) -= c; ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; - rdp->n_cbs_invoked += c; + rdp->n_nocbs_invoked += c; } return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3189f9aa3e84..0d095dcaa670 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -148,8 +148,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); #endif /* #ifdef CONFIG_RCU_BOOST */ seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); + seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_nocbs_invoked, + rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata(struct seq_file *m, void *v) -- cgit v1.2.3 From 4e79752c25ec221ac1e28f8875b539ed7631a0db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Nov 2012 13:35:32 -0800 Subject: sched: Mark RCU reader in sched_show_task() When sched_show_task() is invoked from try_to_freeze_tasks(), there is no RCU read-side critical section, resulting in the following splat: [ 125.780730] =============================== [ 125.780766] [ INFO: suspicious RCU usage. ] [ 125.780804] 3.7.0-rc3+ #988 Not tainted [ 125.780838] ------------------------------- [ 125.780875] /home/rafael/src/linux/kernel/sched/core.c:4497 suspicious rcu_dereference_check() usage! 
[ 125.780946] [ 125.780946] other info that might help us debug this: [ 125.780946] [ 125.781031] [ 125.781031] rcu_scheduler_active = 1, debug_locks = 0 [ 125.781087] 4 locks held by s2ram/4211: [ 125.781120] #0: (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x3f/0x160 [ 125.781233] #1: (s_active#94){.+.+.+}, at: [] sysfs_write_file+0xc8/0x160 [ 125.781339] #2: (pm_mutex){+.+.+.}, at: [] pm_suspend+0x81/0x230 [ 125.781439] #3: (tasklist_lock){.?.?..}, at: [] try_to_freeze_tasks+0x2cd/0x3f0 [ 125.781543] [ 125.781543] stack backtrace: [ 125.781584] Pid: 4211, comm: s2ram Not tainted 3.7.0-rc3+ #988 [ 125.781632] Call Trace: [ 125.781662] [] lockdep_rcu_suspicious+0x103/0x140 [ 125.781719] [] sched_show_task+0x121/0x180 [ 125.781770] [] try_to_freeze_tasks+0x394/0x3f0 [ 125.781823] [] freeze_kernel_threads+0x25/0x80 [ 125.781876] [] pm_suspend+0x165/0x230 [ 125.781924] [] state_store+0x99/0x100 [ 125.781975] [] kobj_attr_store+0x17/0x20 [ 125.782038] [] sysfs_write_file+0xe1/0x160 [ 125.782091] [] vfs_write+0xc6/0x180 [ 125.782138] [] sys_write+0x5a/0xa0 [ 125.782185] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 125.782242] [] system_call_fastpath+0x16/0x1b This commit therefore adds the needed RCU read-side critical section. Reported-by: "Rafael J. Wysocki" Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d4569e0924d..36f260864f65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4474,6 +4474,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { unsigned long free = 0; + int ppid; unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; @@ -4493,8 +4494,11 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + rcu_read_lock(); + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), + task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); -- cgit v1.2.3 From d328b836823cd4a76611a45f52e208f8ce3d75d7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 16 Nov 2012 03:02:57 +0000 Subject: userns: make each net (net_ns) belong to a user_ns The user namespace which creates a new network namespace owns that namespace and all resources created in it. This way we can target capability checks for privileged operations against network resources to the user_ns which created the network namespace in which the resource lives. Privilege to the user namespace which owns the network namespace, or any parent user namespace thereof, provides the same privilege to the network resource. This patch is reworked from a version originally by Serge E. Hallyn Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman Signed-off-by: David S. 
Miller --- kernel/nsproxy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b576f7f14bc6..7e1c3de1ce45 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -90,7 +90,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; -- cgit v1.2.3 From 038e7332b8d4c0629a2965e3ede1a92e8e427bd6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 14 Jun 2012 02:31:10 -0700 Subject: userns: make each net (net_ns) belong to a user_ns The user namespace which creates a new network namespace owns that namespace and all resources created in it. This way we can target capability checks for privileged operations against network resources to the user_ns which created the network namespace in which the resource lives. Privilege to the user namespace which owns the network namespace, or any parent user namespace thereof, provides the same privilege to the network resource. This patch is reworked from a version originally by Serge E. Hallyn Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/nsproxy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b576f7f14bc6..7e1c3de1ce45 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -90,7 +90,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; -- cgit v1.2.3 From 49f4d8b93ccf9454284b6f524b96c66d8d7fbccc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Aug 2012 04:25:10 -0700 Subject: pidns: Capture the user namespace and filter ns_last_pid - Capture the the user namespace that creates the pid namespace - Use that user namespace to test if it is ok to write to /proc/sys/kernel/ns_last_pid. Zhao Hongjiang noticed I was missing a put_user_ns in when destroying a pid_ns. I have foloded his patch into this one so that bisects will work properly. Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- kernel/nsproxy.c | 2 +- kernel/pid.c | 1 + kernel/pid_namespace.c | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 7e1c3de1ce45..ca27d2c5264d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; diff --git a/kernel/pid.c b/kernel/pid.c index aebd4f5aaf41..2a624f1486e1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -78,6 +78,7 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7b07cc0dfb75..b2604950aa50 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -74,7 +75,8 @@ err_alloc: /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, + struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; @@ -102,6 +104,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); + ns->user_ns = get_user_ns(user_ns); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -117,6 +120,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p out_put_parent_pid_ns: put_pid_ns(parent_pid_ns); + put_user_ns(user_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -131,16 +135,18 @@ static void destroy_pid_namespace(struct pid_namespace *ns) for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); + return create_pid_namespace(user_ns, old_ns); } static void free_pid_ns(struct kref *kref) @@ -239,9 +245,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !capable(CAP_SYS_ADMIN)) + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -250,7 +257,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. 
*/ - tmp.data = ¤t->nsproxy->pid_ns->last_pid; + tmp.data = &pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } -- cgit v1.2.3 From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a processes life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork since the pid has not yet been attached to the process I use ns_of_pid, to achieve the same effect. Signed-off-by: Eric W. Biederman --- kernel/cgroup.c | 2 +- kernel/events/core.c | 2 +- kernel/fork.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid.c | 8 ++++---- kernel/signal.c | 2 +- kernel/sysctl_binary.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..0dbfba2efa77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3390,7 +3390,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134d..738f3564e83b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..7798c247f4b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + ns_of_pid(pid)->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ca27d2c5264d..acc92680381a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; diff --git a/kernel/pid.c b/kernel/pid.c index 2a624f1486e1..3a5f238c1ca0 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -429,7 +429,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, 
current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -484,7 +484,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -495,7 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..b2445d86f226 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1752,7 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) -- cgit v1.2.3 From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. 
Biederman" --- kernel/fork.c | 2 -- kernel/pid.c | 21 +++++++++++++++++---- kernel/pid_namespace.c | 14 +++++++------- 3 files changed, 24 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7798c247f4b9..666dc8b06606 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1476,8 +1476,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca0..e957f8b09136 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -270,8 +271,12 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + hlist_del_rcu(&upid->pid_chain); + if (--upid->ns->nr_hashed == 0) + schedule_work(&upid->ns->proc_work); + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: @@ -570,6 +582,7 @@ void __init pidmap_init(void) /* Reserve PID 0. 
We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b2604950aa50..84591cfeefc1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -72,6 +72,12 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 @@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); - put_user_ns(user_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: -- cgit v1.2.3 From 5e1182deb81ae8c68494017c4a8a71811659c870 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Jul 2010 18:50:25 -0700 Subject: pidns: Don't allow new processes in a dead pid namespace. Set nr_hashed to -1 just before we schedule the work to cleanup proc. Test nr_hashed just before we hash a new pid and if nr_hashed is < 0 fail. This guaranteees that processes never enter a pid namespaces after we have cleaned up the state to support processes in a pid namespace. Currently sending SIGKILL to all of the process in a pid namespace as init exists gives us this guarantee but we need something a little stronger to support unsharing and joining a pid namespace. Acked-by: "Serge E. Hallyn" Signed-off-by: Eric W. Biederman --- kernel/pid.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index e957f8b09136..9c219117af36 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -274,8 +274,10 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; hlist_del_rcu(&upid->pid_chain); - if (--upid->ns->nr_hashed == 0) + if (--upid->ns->nr_hashed == 0) { + upid->ns->nr_hashed = -1; schedule_work(&upid->ns->proc_work); + } } spin_unlock_irqrestore(&pidmap_lock, flags); @@ -321,6 +323,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); + if (ns->nr_hashed < 0) + goto out_unlock; for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); @@ -331,6 +335,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) out: return pid; +out_unlock: + spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); -- cgit v1.2.3 From af4b8a83add95ef40716401395b44a1b579965f4 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Wed, 1 Aug 2012 15:03:42 -0700 Subject: pidns: Wait in zap_pid_ns_processes until pid_ns->nr_hashed == 1 Looking at pid_ns->nr_hashed is a bit simpler and it works for disjoint process trees that an unshare or a join of a pid_namespace may create. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 12 ------------ kernel/pid.c | 16 +++++++++++++--- kernel/pid_namespace.c | 15 ++++----------- 3 files changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..d7fe58db4527 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). - */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } diff --git a/kernel/pid.c b/kernel/pid.c index 9c219117af36..6e8da291de49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -273,10 +273,20 @@ void free_pid(struct pid *pid) spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - if (--upid->ns->nr_hashed == 0) { - upid->ns->nr_hashed = -1; - schedule_work(&upid->ns->proc_work); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). + */ + wake_up_process(ns->child_reaper); + break; + case 0: + ns->nr_hashed = -1; + schedule_work(&ns->proc_work); + break; } } spin_unlock_irqrestore(&pidmap_lock, flags); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 84591cfeefc1..3cc29b830e9e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -217,22 +217,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). */ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(¤t->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; -- cgit v1.2.3 From 225778d68d98e7cfe2579f8d8b2d7b76f8541b8b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Aug 2012 08:35:35 -0700 Subject: pidns: Deny strange cases when creating pid namespaces. task_active_pid_ns(current) != current->ns_proxy->pid_ns will soon be allowed to support unshare and setns. The definition of creating a child pid namespace when task_active_pid_ns(current) != current->ns_proxy->pid_ns could be that we create a child pid namespace of current->ns_proxy->pid_ns. However that leads to strange cases like trying to have a single process be init in multiple pid namespaces, which is racy and hard to think about. 
The definition of creating a child pid namespace when task_active_pid_ns(current) != current->ns_proxy->pid_ns could be that we create a child pid namespace of task_active_pid_ns(current). While that seems less racy it does not provide any utility. Therefore define the semantics of creating a child pid namespace when task_active_pid_ns(current) != current->ns_proxy->pid_ns to be that the pid namespace creation fails. That is easy to implement and easy to think about. Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 3cc29b830e9e..0dbbc66b6ec6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -146,6 +146,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, return get_pid_ns(old_ns); if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); + if (task_active_pid_ns(current) != old_ns) + return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); } -- cgit v1.2.3 From 57e8391d327609cbf12d843259c968b9e5c1838f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:17:03 -0800 Subject: pidns: Add setns support - Pid namespaces are designed to be inescapable so verify that the passed in pid namespace is a child of the currently active pid namespace or the currently active pid namespace itself. Allowing the currently active pid namespace is important so the effects of an earlier setns can be cancelled. Signed-off-by: Eric W. Biederman --- kernel/pid_namespace.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0dbbc66b6ec6..f78fc48c86bc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -301,6 +301,60 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } +static void *pidns_get(struct task_struct *task) +{ + struct pid_namespace *ns; + + rcu_read_lock(); + ns = get_pid_ns(task_active_pid_ns(task)); + rcu_read_unlock(); + + return ns; +} + +static void pidns_put(void *ns) +{ + put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *ancestor, *new = ns; + + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Only allow entering the current active pid namespace + * or a child of the current active pid namespace. + * + * This is required for fork to return a usable pid value and + * this maintains the property that processes and their + * children can not escape their current pid namespace. + */ + if (new->level < active->level) + return -EINVAL; + + ancestor = new; + while (ancestor->level > active->level) + ancestor = ancestor->parent; + if (ancestor != active) + return -EINVAL; + + put_pid_ns(nsproxy->pid_ns); + nsproxy->pid_ns = get_pid_ns(new); + return 0; +} + +const struct proc_ns_operations pidns_operations = { + .name = "pid", + .type = CLONE_NEWPID, + .get = pidns_get, + .put = pidns_put, + .install = pidns_install, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); -- cgit v1.2.3 From 1c4042c29bd2e85aac4110552ca8ade763762e84 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 12 Jul 2010 17:10:36 -0700 Subject: pidns: Consolidate initialzation of special init task state Instead of setting child_reaper and SIGNAL_UNKILLABLE one way for the system init process, and another way for pid namespace init processes test pid->nr == 1 and use the same code for both. For the global init this results in SIGNAL_UNKILLABLE being set much earlier in the initialization process. This is a small cleanup and it paves the way for allowing unshare and enter of the pid namespace as that path like our global init also will not set CLONE_NEWPID. Signed-off-by: Eric W. Biederman --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 666dc8b06606..0f2bbce311fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1039,8 +1039,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1441,8 +1439,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) + if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); -- cgit v1.2.3 From 50804fe3737ca6a5942fdc2057a18a8141d00141 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 15:41:50 -0800 Subject: pidns: Support unsharing the pid namespace. Unsharing of the pid namespace unlike unsharing of other namespaces does not take affect immediately. Instead it affects the children created with fork and clone. The first of these children becomes the init process of the new pid namespace, the rest become oddball children of pid 0. From the point of view of the new pid namespace the process that created it is pid 0, as it's pid does not map. A couple of different semantics were considered but this one was settled on because it is easy to implement and it is usable from pam modules. The core reasons for the existence of unshare. I took a survey of the callers of pam modules and the following appears to be a representative sample of their logic. { setup stuff include pam child = fork(); if (!child) { setuid() exec /bin/bash } waitpid(child); pam and other cleanup } As you can see there is a fork to create the unprivileged user space process. Which means that the unprivileged user space process will appear as pid 1 in the new pid namespace. Further most login processes do not cope with extraneous children which means shifting the duty of reaping extraneous child process to the creator of those extraneous children makes the system more comprehensible. The practical reason for this set of pid namespace semantics is that it is simple to implement and verify they work correctly. Whereas an implementation that requres changing the struct pid on a process comes with a lot more races and pain. Not the least of which is that glibc caches getpid(). These semantics are implemented by having two notions of the pid namespace of a proces. There is task_active_pid_ns which is the pid namspace the process was created with and the pid namespace that all pids are presented to that process in. 
The task_active_pid_ns is stored in the struct pid of the task. Then there is the pid namespace that will be used for children that pid namespace is stored in task->nsproxy->pid_ns. Signed-off-by: Eric W. Biederman --- kernel/fork.c | 32 +++++++++++++++++++++++++------- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 -- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0f2bbce311fc..811ffbad7889 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1565,9 +1565,11 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; + } + if (clone_flags & CLONE_NEWUSER) { /* hopefully this check will go away when userns support is * complete */ @@ -1692,7 +1694,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1763,15 +1766,30 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index acc92680381a..b8d4d8709d70 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -188,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f78fc48c86bc..68508d330634 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -144,8 +144,6 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) - return ERR_PTR(-EINVAL); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); -- cgit v1.2.3 From 771b1371686e0a63e938ada28de020b9a0040f55 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 21:08:32 -0700 Subject: vfs: Add a user namespace reference from struct mnt_namespace This will allow for support for unprivileged mounts in a new user namespace. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. 
Biederman" --- kernel/nsproxy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b8d4d8709d70..7f8b051fc19f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -66,7 +66,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, task_cred_xxx(tsk, user_ns), new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; -- cgit v1.2.3 From 5eaf563e53294d6696e651466697eb9d491f3946 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Nov 2011 17:22:31 -0800 Subject: userns: Allow unprivileged users to create user namespaces. Now that we have been through every permission check in the kernel having uid == 0 and gid == 0 in your local user namespace no longer adds any special privileges. Even having a full set of caps in your local user namespace is safe because capabilies are relative to your local user namespace, and do not confer unexpected privileges. Over the long term this should allow much more of the kernels functionality to be safely used by non-root users. Functionality like unsharing the mount namespace that is only unsafe because it can fool applications whose privileges are raised when they are executed. Since those applications have no privileges in a user namespaces it becomes safe to spoof and confuse those applications all you want. Those capabilities will still need to be enabled carefully because we may still need things like rlimits on the number of unprivileged mounts but that is to avoid DOS attacks not to avoid fooling root owned processes. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/fork.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 811ffbad7889..8c29abb19014 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1569,14 +1569,6 @@ long do_fork(unsigned long clone_flags, if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; } - if (clone_flags & CLONE_NEWUSER) { - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; - } /* * Determine whether and which event to report to ptracer. When -- cgit v1.2.3 From fd25b4c2f226de818e1d2b71e3e681d28bcaf5ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Nov 2012 18:21:22 +0100 Subject: vtime: Remove the underscore prefix invasion Prepending irq-unsafe vtime APIs with underscores was actually a bad idea as the result is a big mess in the API namespace that is even waiting to be further extended. Also these helpers are always called from irq safe callers except kvm. Just provide a vtime_account_system_irqsafe() for this specific case so that we can remove the underscore prefix on other vtime functions. 
Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- kernel/sched/cputime.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8d859dae5bed..c0aa1ba752ea 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -433,20 +433,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *st = cputime.stime; } -void vtime_account_system(struct task_struct *tsk) +void vtime_account_system_irqsafe(struct task_struct *tsk) { unsigned long flags; local_irq_save(flags); - __vtime_account_system(tsk); + vtime_account_system(tsk); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(vtime_account_system); +EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * __vtime_account_system() and __vtime_account_idle(). Archs that + * vtime_account_system() and vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -459,9 +459,9 @@ void vtime_account(struct task_struct *tsk) local_irq_save(flags); if (in_interrupt() || !is_idle_task(tsk)) - __vtime_account_system(tsk); + vtime_account_system(tsk); else - __vtime_account_idle(tsk); + vtime_account_idle(tsk); local_irq_restore(flags); } -- cgit v1.2.3 From e3942ba04052364d3c6454103362cafd87456010 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Nov 2012 00:24:25 +0100 Subject: vtime: Consolidate a bit the ctx switch code On ia64 and powerpc, vtime context switch only consists in flushing system and user pending time, plus a few arch housekeeping. Consolidate that into a generic implementation. s390 is a special case because pending user and system time accounting there is hard to dissociate. So it's keeping its own implementation. Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- kernel/sched/cputime.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index c0aa1ba752ea..2e8d34aac97e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -443,6 +443,19 @@ void vtime_account_system_irqsafe(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + + vtime_account_user(prev); + arch_vtime_task_switch(prev); +} +#endif + /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement -- cgit v1.2.3 From 1017769bd0073f0a73e066377cd79a10cf0a33ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Nov 2012 00:26:54 +0100 Subject: vtime: No need to disable irqs on vtime_account() vtime_account() is only called from irq entry. 
irqs are always disabled at this point so we can safely remove the irq disabling guards on that function. Signed-off-by: Frederic Weisbecker Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens --- kernel/sched/cputime.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2e8d34aac97e..80b2fd5a7cf0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -467,16 +467,10 @@ void vtime_task_switch(struct task_struct *prev) #ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account(struct task_struct *tsk) { - unsigned long flags; - - local_irq_save(flags); - if (in_interrupt() || !is_idle_task(tsk)) vtime_account_system(tsk); else vtime_account_idle(tsk); - - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(vtime_account); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -- cgit v1.2.3 From 175431635ec09b1d1bba04979b006b99e8305a83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:35 -0800 Subject: cgroup: remove incorrect dget/dput() pair in cgroup_create_dir() cgroup_create_dir() does weird dancing with dentry refcnt. On success, it gets and then puts it achieving nothing. On failure, it puts but there isn't no matching get anywhere leading to the following oops if cgroup_create_file() fails for whatever reason. ------------[ cut here ]------------ kernel BUG at /work/os/work/fs/dcache.c:552! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU 2 Pid: 697, comm: mkdir Not tainted 3.7.0-rc4-work+ #3 Bochs Bochs RIP: 0010:[] [] dput+0x1dc/0x1e0 RSP: 0018:ffff88001a3ebef8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88000e5b1ef8 RCX: 0000000000000403 RDX: 0000000000000303 RSI: 2000000000000000 RDI: ffff88000e5b1f58 RBP: ffff88001a3ebf18 R08: ffffffff82c76960 R09: 0000000000000001 R10: ffff880015022080 R11: ffd9bed70f48a041 R12: 00000000ffffffea R13: 0000000000000001 R14: ffff88000e5b1f58 R15: 00007fff57656d60 FS: 00007ff05fcb3800(0000) GS:ffff88001fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004046f0 CR3: 000000001315f000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process mkdir (pid: 697, threadinfo ffff88001a3ea000, task ffff880015022080) Stack: ffff88001a3ebf48 00000000ffffffea 0000000000000001 0000000000000000 ffff88001a3ebf38 ffffffff811cc889 0000000000000001 ffff88000e5b1ef8 ffff88001a3ebf68 ffffffff811d1fc9 ffff8800198d7f18 ffff880019106ef8 Call Trace: [] done_path_create+0x19/0x50 [] sys_mkdirat+0x59/0x80 [] sys_mkdir+0x19/0x20 [] system_call_fastpath+0x16/0x1b Code: 00 48 8d 90 18 01 00 00 48 89 93 c0 00 00 00 4c 89 a0 18 01 00 00 48 8b 83 a0 00 00 00 83 80 28 01 00 00 01 e8 e6 6f a0 00 eb 92 <0f> 0b 66 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 49 89 fe 41 RIP [] dput+0x1dc/0x1e0 RSP ---[ end trace 1277bcfd9561ddb0 ]--- Fix it by dropping the unnecessary dget/dput() pair. 
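For reference, a stripped-down sketch of the broken shape (generic name and a stand-in @error, not the actual cgroup code):

#include <linux/dcache.h>

/*
 * @error stands in for the result of the file-creation step.  The fix
 * is simply to delete both reference-count lines below: this function
 * never took a reference of its own, so it has none to drop.
 */
static int example_create_dir(struct dentry *dentry, int error)
{
	if (!error)
		dget(dentry);	/* +1, immediately cancelled below */
	dput(dentry);		/* unmatched on failure: refcount underflow, BUG in dput() */
	return error;
}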
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f8fa6aa371b..d0803f097f87 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2684,9 +2684,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); rcu_assign_pointer(cgrp->dentry, dentry); - dget(dentry); } - dput(dentry); return error; } -- cgit v1.2.3 From 2243076ad128d0cc196cf6597e6ddcf6bc907676 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:35 -0800 Subject: cgroup: initialize cgrp->allcg_node in init_cgroup_housekeeping() Not strictly necessary but it's annoying to have uninitialized list_head around. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d0803f097f87..ed0e177c4650 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1381,6 +1381,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); -- cgit v1.2.3 From 28fd6f30ac3efd9170ae1ac89f3521d53b5eb83a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: open-code cgroup_create_dir() The operation order of cgroup creation is about to change and cgroup_create_dir() is more of a hindrance than a proper abstraction. Open-code it by moving the parent nlink adjustment next to self nlink adjustment in cgroup_create_file() and the rest to cgroup_create(). This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed0e177c4650..b042673171e9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2652,6 +2652,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, /* start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); + inc_nlink(dentry->d_parent->d_inode); /* start with the directory inode held, so that we can * populate it without racing with another mkdir */ @@ -2666,30 +2667,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, return 0; } -/* - * cgroup_create_dir - create a directory for an object. - * @cgrp: the cgroup we create the directory for. It must have a valid - * ->parent field. And we are going to fill its ->dentry field. - * @dentry: dentry of the new cgroup - * @mode: mode to set on new directory. 
- */ -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - umode_t mode) -{ - struct dentry *parent; - int error = 0; - - parent = cgrp->parent->dentry; - error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); - if (!error) { - dentry->d_fsdata = cgrp; - inc_nlink(parent->d_inode); - rcu_assign_pointer(cgrp->dentry, dentry); - } - - return error; -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -4138,10 +4115,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - err = cgroup_create_dir(cgrp, dentry, mode); + err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) goto err_remove; + dentry->d_fsdata = cgrp; + rcu_assign_pointer(cgrp->dentry, dentry); + for_each_subsys(root, ss) { /* each css holds a ref to the cgroup's dentry */ dget(dentry); -- cgit v1.2.3 From 4e139afc22cb98d0d032ffce0285bfcc73ca5217 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: create directory before linking while creating a new cgroup While creating a new cgroup, cgroup_create() links the newly allocated cgroup into various places before trying to create its directory. Because cgroup life-cycle is tied to the vfs objects, this makes it impossible to use cgroup_rmdir() for rolling back creation - the removal logic depends on having full vfs objects. This patch moves directory creation above linking and collect linking operations to one place. This allows directory creation failure to share error exit path with css allocation failures and any failure sites afterwards (to be added later) can use cgroup_rmdir() logic to undo creation. Note that this also makes the memory barriers around cgroup->dentry, which currently is misleadingly using RCU operations, unnecessary. This will be handled in the next patch. While at it, locking BUG_ON() on i_mutex is converted to lockdep_assert_held(). v2: Patch originally removed %NULL dentry check in cgroup_path(); however, Li pointed out that this patch doesn't make it unnecessary as ->create() may call cgroup_path(). Drop the change for now. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b042673171e9..d62a529db2f7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4112,15 +4112,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - + /* + * Create directory. cgroup_create_file() returns with the new + * directory locked on success so that it can be populated without + * dropping cgroup_mutex. 
+ */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_remove; + goto err_destroy; + lockdep_assert_held(&dentry->d_inode->i_mutex); + /* allocation complete, commit to creation */ dentry->d_fsdata = cgrp; rcu_assign_pointer(cgrp->dentry, dentry); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); + list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + root->number_of_cgroups++; for_each_subsys(root, ss) { /* each css holds a ref to the cgroup's dentry */ @@ -4131,11 +4138,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_create(cgrp); } - /* The cgroup directory was pre-locked for us */ - BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); - - list_add_tail(&cgrp->allcg_node, &root->allcg_list); - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); /* If err < 0, we have a half-filled directory - oh well ;) */ @@ -4144,20 +4146,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; - err_remove: - - list_del_rcu(&cgrp->sibling); - root->number_of_cgroups--; - - err_destroy: - +err_destroy: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) ss->destroy(cgrp); } - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free: -- cgit v1.2.3 From febfcef60d4f9457785b45aab548bc7ee5ea158f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: cgroup->dentry isn't a RCU pointer cgroup->dentry is marked and used as a RCU pointer; however, it isn't one - the final dentry put doesn't go through call_rcu(). cgroup and dentry share the same RCU freeing rule via synchronize_rcu() in cgroup_diput() (kfree_rcu() used on cgrp is unnecessary). If cgrp is accessible under RCU read lock, so is its dentry and dereferencing cgrp->dentry doesn't need any further RCU protection or annotation. While not being accurate, before the previous patch, the RCU accessors served a purpose as memory barriers - cgroup->dentry used to be assigned after the cgroup was made visible to cgroup_path(), so the assignment and dereferencing in cgroup_path() needed the memory barrier pair. Now that list_add_tail_rcu() happens after cgroup->dentry is assigned, this no longer is necessary. Remove the now unnecessary and misleading RCU annotations from cgroup->dentry. To make up for the removal of rcu_dereference_check() in cgroup_path(), add an explicit rcu_lockdep_assert(), which asserts the dereference rule of @cgrp, not cgrp->dentry. 
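To restate the rule with made-up structures (a hedged sketch, not the cgroup code): if the pointee is only freed together with its container, a plain load plus an assertion on the container's dereference rule is enough.

#include <linux/rcupdate.h>

struct example_child  { int data; };
struct example_parent { struct example_child *child; };

/* Anyone allowed to dereference @parent may also read parent->child. */
static int example_read(struct example_parent *parent)
{
	struct example_child *child;

	rcu_lockdep_assert(rcu_read_lock_held(),
			   "example_read() called without rcu_read_lock()");
	child = parent->child;	/* no rcu_dereference() needed */
	return child ? child->data : -1;
}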
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d62a529db2f7..affc76d7f739 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1756,9 +1756,11 @@ static struct kobject *cgroup_kobj; */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { + struct dentry *dentry = cgrp->dentry; char *start; - struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + + rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), + "cgroup_path() called without proper locking"); if (!dentry || cgrp == dummytop) { /* @@ -1782,8 +1784,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) if (!cgrp) break; - dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + dentry = cgrp->dentry; if (!cgrp->parent) continue; if (--start < buf) @@ -4124,7 +4125,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* allocation complete, commit to creation */ dentry->d_fsdata = cgrp; - rcu_assign_pointer(cgrp->dentry, dentry); + cgrp->dentry = dentry; list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; -- cgit v1.2.3 From 38b53abaa3e0c7e750ef73eee919cf42eee6b134 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: make CSS_* flags bit masks instead of bit positions Currently, CSS_* flags are defined as bit positions and manipulated using atomic bitops. There's no reason to use atomic bitops for them and bit positions are clunkier to deal with than bit masks. Make CSS_* bit masks instead and use the usual C bitwise operators to access them. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index affc76d7f739..82ad8785fafe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4020,7 +4020,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags = 0; css->id = NULL; if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); + css->flags |= CSS_ROOT; BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; -- cgit v1.2.3 From b48c6a80a0f7584056f768268fc12b87745a393f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: trivial cleanup for cgroup_init/load_subsys() Consistently use @css and @dummytop in these two functions instead of referring to them indirectly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 82ad8785fafe..6cc693b91c4a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4332,7 +4332,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. 
*/ - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; + init_css_set.subsys[ss->subsys_id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4344,7 +4344,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) ss->active = 1; if (ss->post_create) - ss->post_create(&ss->root->top_cgroup); + ss->post_create(dummytop); /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ @@ -4456,7 +4456,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->active = 1; if (ss->post_create) - ss->post_create(&ss->root->top_cgroup); + ss->post_create(dummytop); /* success! */ mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From 648bb56d076bde31113f09a7d24d95bc8d4155ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: lock cgroup_mutex in cgroup_init_subsys() Make cgroup_init_subsys() grab cgroup_mutex while initializing a subsystem so that all helpers and callbacks are called under the context they expect. This isn't strictly necessary as cgroup_init_subsys() doesn't race with anybody but will allow adding lockdep assertions. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6cc693b91c4a..09751657abdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4317,6 +4317,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_mutex); + /* init base cftset */ cgroup_init_cftsets(ss); @@ -4346,6 +4348,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) if (ss->post_create) ss->post_create(dummytop); + mutex_unlock(&cgroup_mutex); + /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ BUG_ON(ss->module); -- cgit v1.2.3 From 02ae7486d05ae6df8395409a4945b2420f1e35c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: fix harmless bugs in cgroup_load_subsys() fail path and cgroup_unload_subsys() * If idr init fails, cgroup_load_subsys() cleared dummytop->subsys[] before calilng ->destroy() making CSS inaccessible to the callback, and didn't unlink ss->sibling. As no modular controller uses ->use_id, this doesn't cause any actual problems. * cgroup_unload_subsys() was forgetting to free idr, call ->pre_destroy() and clear ->active. As there currently is no modular controller which uses ->use_id, ->pre_destroy() or ->active, this doesn't cause any actual problems. 
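The ordering issue in the first point can be sketched with generic names (hypothetical types, not the real cgroup structures): run the destroy callback while the slot it consults is still populated, then clear the slot and undo the list linkage.

#include <linux/list.h>

struct example_subsys {
	void (*destroy)(void);
	struct list_head sibling;
};

static void example_registration_failed(struct example_subsys *ss,
					struct example_subsys **slot)
{
	ss->destroy();			/* callback can still reach *slot */
	*slot = NULL;			/* clear only after the callback ran */
	list_del_init(&ss->sibling);	/* undo the earlier list_add() */
}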
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 09751657abdc..5679cb1ce43f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4420,9 +4420,10 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (ss->use_id) { int ret = cgroup_init_idr(ss, css); if (ret) { - dummytop->subsys[ss->subsys_id] = NULL; ss->destroy(dummytop); + dummytop->subsys[ss->subsys_id] = NULL; subsys[ss->subsys_id] = NULL; + list_del_init(&ss->sibling); mutex_unlock(&cgroup_mutex); return ret; } @@ -4490,7 +4491,19 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) */ BUG_ON(ss->root != &rootnode); + /* ->pre_destroy() should be called outside cgroup_mutex for now */ + if (ss->pre_destroy) + ss->pre_destroy(dummytop); + mutex_lock(&cgroup_mutex); + + ss->active = 0; + + if (ss->use_id) { + idr_remove_all(&ss->idr); + idr_destroy(&ss->idr); + } + /* deassign the subsys_id */ subsys[ss->subsys_id] = NULL; -- cgit v1.2.3 From 42809dd4225b2f3127a4804314a1b33608620d96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: separate out cgroup_destroy_locked() Separate out cgroup_destroy_locked() from cgroup_destroy(). This will be later used in cgroup_create() failure path. While at it, add lockdep asserts on i_mutex and cgroup_mutex, and move @d and @parent assignments to their declarations. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5679cb1ce43f..4412d9694f13 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -242,6 +242,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); */ static int need_forkexit_callback __read_mostly; +static int cgroup_destroy_locked(struct cgroup *cgrp); + #ifdef CONFIG_PROVE_LOCKING int cgroup_lock_is_held(void) { @@ -4209,22 +4211,20 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_destroy_locked(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *cgrp = dentry->d_fsdata; - struct dentry *d; - struct cgroup *parent; + struct dentry *d = cgrp->dentry; + struct cgroup *parent = cgrp->parent; DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - /* the vfs holds both inode->i_mutex already */ - mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); + lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) return -EBUSY; - } /* * Block new css_tryget() by deactivating refcnt and mark @cgrp @@ -4243,7 +4243,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) /* * Tell subsystems to initate destruction. pre_destroy() should be * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix - * potential deadlock in pre_destroy") for details. + * potential deadlock in pre_destroy") for details. This temporary + * unlocking should go away once cgroup_mutex is unexported from + * controllers. 
*/ mutex_unlock(&cgroup_mutex); for_each_subsys(cgrp->root, ss) @@ -4268,11 +4270,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - list_del_init(&cgrp->allcg_node); - d = dget(cgrp->dentry); - + dget(d); cgroup_d_remove_dir(d); dput(d); @@ -4293,10 +4293,20 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } spin_unlock(&cgrp->event_list_lock); - mutex_unlock(&cgroup_mutex); return 0; } +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = cgroup_destroy_locked(dentry->d_fsdata); + mutex_unlock(&cgroup_mutex); + + return ret; +} + static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); -- cgit v1.2.3 From a31f2d3ff7fe20cbe2a143515a7d7c408b29dd0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: introduce CSS_ONLINE flag and on/offline_css() helpers New helpers on/offline_css() respectively wrap ->post_create() and ->pre_destroy() invocations. online_css() sets CSS_ONLINE after ->post_create() is complete and offline_css() invokes ->pre_destroy() iff CSS_ONLINE is set and clears it while also handling the temporary dropping of cgroup_mutex. This patch doesn't introduce any behavior change at the moment but will be used to improve cgroup_create() failure path and allow ->post_create() to fail. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4412d9694f13..78a3d5c0968e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4035,6 +4035,42 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } +/* invoke ->post_create() on a new CSS and mark it online */ +static void online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + lockdep_assert_held(&cgroup_mutex); + + if (ss->post_create) + ss->post_create(cgrp); + cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; +} + +/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + lockdep_assert_held(&cgroup_mutex); + + if (!(css->flags & CSS_ONLINE)) + return; + + /* + * pre_destroy() should be called with cgroup_mutex unlocked. See + * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for + * details. This temporary unlocking should go away once + * cgroup_mutex is unexported from controllers. 
+ */ + if (ss->pre_destroy) { + mutex_unlock(&cgroup_mutex); + ss->pre_destroy(cgrp); + mutex_lock(&cgroup_mutex); + } + + cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4137,8 +4173,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, dget(dentry); /* creation succeeded, notify subsystems */ - if (ss->post_create) - ss->post_create(cgrp); + online_css(ss, cgrp); } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); @@ -4240,18 +4275,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } set_bit(CGRP_REMOVED, &cgrp->flags); - /* - * Tell subsystems to initate destruction. pre_destroy() should be - * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix - * potential deadlock in pre_destroy") for details. This temporary - * unlocking should go away once cgroup_mutex is unexported from - * controllers. - */ - mutex_unlock(&cgroup_mutex); + /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(cgrp); - mutex_lock(&cgroup_mutex); + offline_css(ss, cgrp); /* * Put all the base refs. Each css holds an extra reference to the @@ -4354,9 +4380,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; - - if (ss->post_create) - ss->post_create(dummytop); + online_css(ss, dummytop); mutex_unlock(&cgroup_mutex); @@ -4469,9 +4493,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ss->active = 1; - - if (ss->post_create) - ss->post_create(dummytop); + online_css(ss, dummytop); /* success! */ mutex_unlock(&cgroup_mutex); @@ -4501,12 +4523,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) */ BUG_ON(ss->root != &rootnode); - /* ->pre_destroy() should be called outside cgroup_mutex for now */ - if (ss->pre_destroy) - ss->pre_destroy(dummytop); - mutex_lock(&cgroup_mutex); + offline_css(ss, dummytop); ss->active = 0; if (ss->use_id) { -- cgit v1.2.3 From d19e19de48aa0b90c56cd93c8a46ebac46273429 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: simplify cgroup_load_subsys() failure path Now that cgroup_unload_subsys() can tell whether the root css is online or not, we can safely call cgroup_unload_subsys() after idr init failure in cgroup_load_subsys(). Replace the manual unrolling and invoke cgroup_unload_subsys() on failure. This drops cgroup_mutex inbetween but should be safe as the subsystem will fail try_module_get() and thus can't be mounted inbetween. As this means that cgroup_unload_subsys() can be called before css_sets are rehashed, remove BUG_ON() on %NULL css_set->subsys[] from cgroup_unload_subsys(). 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78a3d5c0968e..166b5141f3d4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4400,8 +4400,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { - int i; struct cgroup_subsys_state *css; + int i, ret; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || @@ -4452,15 +4452,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) init_cgroup_css(css, ss, dummytop); /* init_idr must be after init_cgroup_css because it sets css->id. */ if (ss->use_id) { - int ret = cgroup_init_idr(ss, css); - if (ret) { - ss->destroy(dummytop); - dummytop->subsys[ss->subsys_id] = NULL; - subsys[ss->subsys_id] = NULL; - list_del_init(&ss->sibling); - mutex_unlock(&cgroup_mutex); - return ret; - } + ret = cgroup_init_idr(ss, css); + if (ret) + goto err_unload; } /* @@ -4498,6 +4492,12 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* success! */ mutex_unlock(&cgroup_mutex); return 0; + +err_unload: + mutex_unlock(&cgroup_mutex); + /* @ss can't be mounted here as try_module_get() would fail */ + cgroup_unload_subsys(ss); + return ret; } EXPORT_SYMBOL_GPL(cgroup_load_subsys); @@ -4548,7 +4548,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) struct css_set *cg = link->cg; hlist_del(&cg->hlist); - BUG_ON(!cg->subsys[ss->subsys_id]); cg->subsys[ss->subsys_id] = NULL; hhead = css_set_hash(cg->subsys); hlist_add_head(&cg->hlist, hhead); -- cgit v1.2.3 From b8a2df6a5b576d04f78f54abf254c283527d8bbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: use mutex_trylock() when grabbing i_mutex of a new cgroup directory All cgroup directory i_mutexes nest outside cgroup_mutex; however, new directory creation is a special case. A new cgroup directory is created while holding cgroup_mutex. Populating the new directory requires both the new directory's i_mutex and cgroup_mutex. Because all directory i_mutexes nest outside cgroup_mutex, grabbing both requires releasing cgroup_mutex first, which isn't a good idea as the new cgroup isn't yet ready to be manipulated by other cgroup opreations. This is worked around by grabbing the new directory's i_mutex while holding cgroup_mutex before making it visible. As there's no other user at that point, grabbing the i_mutex under cgroup_mutex can't lead to deadlock. cgroup_create_file() was using I_MUTEX_CHILD to tell lockdep not to worry about the reverse locking order; however, this creates pseudo locking dependency cgroup_mutex -> I_MUTEX_CHILD, which isn't true - all directory i_mutexes are still nested outside cgroup_mutex. This pseudo locking dependency can lead to spurious lockdep warnings. Use mutex_trylock() instead. This will always succeed and lockdep doesn't create any locking dependency for it. 
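The trick generalizes to any object that is not yet visible to other contexts; a hedged sketch with a made-up structure:

#include <linux/bug.h>
#include <linux/mutex.h>

struct example_obj {
	struct mutex lock;
};

static void example_lock_new_object(struct example_obj *obj)
{
	mutex_init(&obj->lock);
	/* @obj is unreachable by anyone else, so this cannot fail and,
	 * unlike mutex_lock_nested(), records no dependency for lockdep. */
	WARN_ON_ONCE(!mutex_trylock(&obj->lock));
	/* populate @obj under obj->lock, then publish it and unlock */
}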
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 166b5141f3d4..c53f42e31704 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2657,9 +2657,15 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, inc_nlink(inode); inc_nlink(dentry->d_parent->d_inode); - /* start with the directory inode held, so that we can - * populate it without racing with another mkdir */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + /* + * Control reaches here with cgroup_mutex held. + * @inode->i_mutex should nest outside cgroup_mutex but we + * want to populate it immediately without releasing + * cgroup_mutex. As @inode isn't visible to anyone else + * yet, trylock will always succeed without affecting + * lockdep checks. + */ + WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; -- cgit v1.2.3 From 4b8b47eb0001a89f271d671d959b235faa8f03e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: update cgroup_create() failure path cgroup_create() was ignoring failure of cgroupfs files. Update it such that, if file creation fails, it rolls back by calling cgroup_destroy_locked() and returns failure. Note that error out goto labels are renamed. The labels are a bit confusing but will become better w/ later cgroup operation renames. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c53f42e31704..027b66e66698 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4107,7 +4107,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free; + goto err_free_cgrp; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4135,13 +4135,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); - goto err_destroy; + goto err_free_all; } init_cgroup_css(css, ss, cgrp); if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) - goto err_destroy; + goto err_free_all; } /* At error, ->destroy() callback has to free assigned ID. 
*/ if (clone_children(parent) && ss->post_clone) @@ -4164,7 +4164,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_destroy; + goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); /* allocation complete, commit to creation */ @@ -4183,14 +4183,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); - /* If err < 0, we have a half-filled directory - oh well ;) */ + if (err) + goto err_destroy; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; -err_destroy: +err_free_all: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) ss->destroy(cgrp); @@ -4198,9 +4199,15 @@ err_destroy: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); -err_free: +err_free_cgrp: kfree(cgrp); return err; + +err_destroy: + cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + return err; } static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -- cgit v1.2.3 From b1929db42f8a649d9a9e397119f628c27fd4021f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: allow ->post_create() to fail There could be cases where controllers want to do initialization operations which may fail from ->post_create(). This patch makes ->post_create() return -errno to indicate failure and online_css() relay such failures. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 29 +++++++++++++++++++---------- kernel/cgroup_freezer.c | 4 +++- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 027b66e66698..c389f4258681 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4041,14 +4041,18 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } -/* invoke ->post_create() on a new CSS and mark it online */ -static void online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +/* invoke ->post_create() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { + int ret = 0; + lockdep_assert_held(&cgroup_mutex); if (ss->post_create) - ss->post_create(cgrp); - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + ret = ss->post_create(cgrp); + if (!ret) + cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + return ret; } /* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ @@ -4174,12 +4178,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - for_each_subsys(root, ss) { - /* each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ + for_each_subsys(root, ss) dget(dentry); - /* creation succeeded, notify subsystems */ - online_css(ss, cgrp); + /* creation succeeded, notify subsystems */ + for_each_subsys(root, ss) { + err = online_css(ss, cgrp); + if (err) + goto err_destroy; } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); @@ -4393,7 +4400,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; - online_css(ss, dummytop); + BUG_ON(online_css(ss, dummytop)); 
mutex_unlock(&cgroup_mutex); @@ -4500,7 +4507,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ss->active = 1; - online_css(ss, dummytop); + ret = online_css(ss, dummytop); + if (ret) + goto err_unload; /* success! */ mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 670a4af7dc94..ee8bb671688c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -112,7 +112,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) * parent's freezing state while holding both parent's and our * freezer->lock. */ -static void freezer_post_create(struct cgroup *cgroup) +static int freezer_post_create(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); struct freezer *parent = parent_freezer(freezer); @@ -136,6 +136,8 @@ static void freezer_post_create(struct cgroup *cgroup) spin_unlock(&freezer->lock); if (parent) spin_unlock_irq(&parent->lock); + + return 0; } /** -- cgit v1.2.3 From 92fb97487a7e41b222c1417cabd1d1ab7cc3a48c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free() Rename cgroup_subsys css lifetime related callbacks to better describe what their roles are. Also, update documentation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 51 +++++++++++++++++++++++++------------------------ kernel/cgroup_freezer.c | 20 +++++++++---------- kernel/cpuset.c | 10 +++++----- kernel/events/core.c | 8 ++++---- kernel/sched/core.c | 16 ++++++++-------- 5 files changed, 53 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c389f4258681..d35463bab487 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -876,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(cgrp); + ss->css_free(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -4048,8 +4048,8 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - if (ss->post_create) - ret = ss->post_create(cgrp); + if (ss->css_online) + ret = ss->css_online(cgrp); if (!ret) cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; return ret; @@ -4067,14 +4067,14 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; /* - * pre_destroy() should be called with cgroup_mutex unlocked. See + * css_offline() should be called with cgroup_mutex unlocked. See * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for * details. This temporary unlocking should go away once * cgroup_mutex is unexported from controllers. */ - if (ss->pre_destroy) { + if (ss->css_offline) { mutex_unlock(&cgroup_mutex); - ss->pre_destroy(cgrp); + ss->css_offline(cgrp); mutex_lock(&cgroup_mutex); } @@ -4136,7 +4136,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->create(cgrp); + css = ss->css_alloc(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4147,7 +4147,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_free_all; } - /* At error, ->destroy() callback has to free assigned ID. */ + /* At error, ->css_free() callback has to free assigned ID. 
*/ if (clone_children(parent) && ss->post_clone) ss->post_clone(cgrp); @@ -4201,7 +4201,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(cgrp); + ss->css_free(cgrp); } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ @@ -4381,7 +4381,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4425,7 +4425,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->create == NULL || ss->destroy == NULL) + ss->css_alloc == NULL || ss->css_free == NULL) return -EINVAL; /* @@ -4454,10 +4454,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = ss; /* - * no ss->create seems to need anything important in the ss struct, so - * this can happen first (i.e. before the rootnode attachment). + * no ss->css_alloc seems to need anything important in the ss + * struct, so this can happen first (i.e. before the rootnode + * attachment). */ - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[ss->subsys_id] = NULL; @@ -4577,12 +4578,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); /* - * remove subsystem's css from the dummytop and free it - need to free - * before marking as null because ss->destroy needs the cgrp->subsys - * pointer to find their state. note that this also takes care of - * freeing the css_id. + * remove subsystem's css from the dummytop and free it - need to + * free before marking as null because ss->css_free needs the + * cgrp->subsys pointer to find their state. note that this also + * takes care of freeing the css_id. 
*/ - ss->destroy(dummytop); + ss->css_free(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4626,8 +4627,8 @@ int __init cgroup_init_early(void) BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->create); - BUG_ON(!ss->destroy); + BUG_ON(!ss->css_alloc); + BUG_ON(!ss->css_free); if (ss->subsys_id != i) { printk(KERN_ERR "cgroup: Subsys %s id == %d\n", ss->name, ss->subsys_id); @@ -5439,7 +5440,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5449,7 +5450,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) return css; } -static void debug_destroy(struct cgroup *cont) +static void debug_css_free(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } @@ -5578,8 +5579,8 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_subsys = { .name = "debug", - .create = debug_create, - .destroy = debug_destroy, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ee8bb671688c..75dda1ea5026 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -92,7 +92,7 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) { struct freezer *freezer; @@ -105,14 +105,14 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) } /** - * freezer_post_create - commit creation of a freezer cgroup + * freezer_css_online - commit creation of a freezer cgroup * @cgroup: cgroup being created * * We're committing to creation of @cgroup. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. */ -static int freezer_post_create(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); struct freezer *parent = parent_freezer(freezer); @@ -141,13 +141,13 @@ static int freezer_post_create(struct cgroup *cgroup) } /** - * freezer_pre_destroy - initiate destruction of @cgroup + * freezer_css_offline - initiate destruction of @cgroup * @cgroup: cgroup being destroyed * * @cgroup is going away. Mark it dead and decrement system_freezing_count * if it was holding one. 
*/ -static void freezer_pre_destroy(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -161,7 +161,7 @@ static void freezer_pre_destroy(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_destroy(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup *cgroup) { kfree(cgroup_freezer(cgroup)); } @@ -477,10 +477,10 @@ static struct cftype files[] = { struct cgroup_subsys freezer_subsys = { .name = "freezer", - .create = freezer_create, - .post_create = freezer_post_create, - .pre_destroy = freezer_pre_destroy, - .destroy = freezer_destroy, + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d7..06931337c4e5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1821,11 +1821,11 @@ static void cpuset_post_clone(struct cgroup *cgroup) } /* - * cpuset_create - create a cpuset + * cpuset_css_alloc - allocate a cpuset css * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1864,7 +1864,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup *cont) +static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); @@ -1878,8 +1878,8 @@ static void cpuset_destroy(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", - .create = cpuset_create, - .destroy = cpuset_destroy, + .css_alloc = cpuset_css_alloc, + .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .post_clone = cpuset_post_clone, diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134d..f9ff5493171d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7434,7 +7434,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) { struct perf_cgroup *jc; @@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_destroy(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, struct cgroup_subsys perf_subsys = { .name = "perf_event", .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, + .css_alloc = perf_cgroup_css_alloc, + .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..6f20c8fb2326 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7468,7 +7468,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) +static struct cgroup_subsys_state 
*cpu_cgroup_css_alloc(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7485,7 +7485,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) return &tg->css; } -static void cpu_cgroup_destroy(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7845,8 +7845,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, + .css_alloc = cpu_cgroup_css_alloc, + .css_free = cpu_cgroup_css_free, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, @@ -7869,7 +7869,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { struct cpuacct root_cpuacct; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7899,7 +7899,7 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_destroy(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); @@ -8070,8 +8070,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, .subsys_id = cpuacct_subsys_id, .base_cftypes = files, }; -- cgit v1.2.3 From 2260e7fc1f18ad815324605c1ce7d5c6fd9b19a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/ clone_children is only meaningful for cpuset and will stay that way. Rename the flag to reflect that and update documentation. Also, drop clone_children() wrapper in cgroup.c. The thin wrapper is used only a few times and one of them will go away soon. Signed-off-by: Tejun Heo Acked-by: Serge E. 
Hallyn Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d35463bab487..2895880e6800 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -296,11 +296,6 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -static int clone_children(const struct cgroup *cgrp) -{ - return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); -} - /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -1101,7 +1096,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (clone_children(&root->top_cgroup)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1113,7 +1108,7 @@ struct cgroup_sb_opts { unsigned long subsys_mask; unsigned long flags; char *release_agent; - bool clone_children; + bool cpuset_clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1164,7 +1159,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "clone_children")) { - opts->clone_children = true; + opts->cpuset_clone_children = true; continue; } if (!strcmp(token, "xattr")) { @@ -1474,8 +1469,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); - if (opts->clone_children) - set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); + if (opts->cpuset_clone_children) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -3905,7 +3900,7 @@ fail: static u64 cgroup_clone_children_read(struct cgroup *cgrp, struct cftype *cft) { - return clone_children(cgrp); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); } static int cgroup_clone_children_write(struct cgroup *cgrp, @@ -3913,9 +3908,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, u64 val) { if (val) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); else - clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); return 0; } @@ -4130,8 +4125,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - if (clone_children(parent)) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { struct cgroup_subsys_state *css; @@ -4148,7 +4143,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; } /* At error, ->css_free() callback has to free assigned ID. 
*/ - if (clone_children(parent) && ss->post_clone) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags) && + ss->post_clone) ss->post_clone(cgrp); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && -- cgit v1.2.3 From 033fa1c5f5f73833598a0beb022c0048fb769dad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:39 -0800 Subject: cgroup, cpuset: remove cgroup_subsys->post_clone() Currently CGRP_CPUSET_CLONE_CHILDREN triggers ->post_clone(). Now that clone_children is cpuset specific, there's no reason to have this rather odd option activation mechanism in cgroup core. cpuset can check the flag from its ->css_allocate() and take the necessary action. Move cpuset_post_clone() logic to the end of cpuset_css_alloc() and remove cgroup_subsys->post_clone(). Loosely based on Glauber's "generalize post_clone into post_create" patch. Signed-off-by: Tejun Heo Original-patch-by: Glauber Costa Original-patch: <1351686554-22592-2-git-send-email-glommer@parallels.com> Acked-by: Serge E. Hallyn Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 4 --- kernel/cpuset.c | 80 ++++++++++++++++++++++++++------------------------------- 2 files changed, 36 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2895880e6800..96719f73e95d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4142,10 +4142,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_free_all; } - /* At error, ->css_free() callback has to free assigned ID. */ - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags) && - ss->post_clone) - ss->post_clone(cgrp); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 06931337c4e5..b017887d632f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1783,43 +1783,6 @@ static struct cftype files[] = { { } /* terminate */ }; -/* - * post_clone() is called during cgroup_create() when the - * clone_children mount argument was specified. The cgroup - * can not yet have any tasks. - * - * Currently we refuse to set up the cgroup - thereby - * refusing the task to be entered, and as a result refusing - * the sys_unshare() or clone() which initiated it - if any - * sibling cpusets have exclusive cpus or mem. - * - * If this becomes a problem for some users who wish to - * allow that scenario, then cpuset_post_clone() could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. Called with cgroup_mutex - * held. 
- */ -static void cpuset_post_clone(struct cgroup *cgroup) -{ - struct cgroup *parent, *child; - struct cpuset *cs, *parent_cs; - - parent = cgroup->parent; - list_for_each_entry(child, &parent->children, sibling) { - cs = cgroup_cs(child); - if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) - return; - } - cs = cgroup_cs(cgroup); - parent_cs = cgroup_cs(parent); - - mutex_lock(&callback_mutex); - cs->mems_allowed = parent_cs->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); - mutex_unlock(&callback_mutex); - return; -} - /* * cpuset_css_alloc - allocate a cpuset css * cont: control group that the new cpuset will be part of @@ -1827,13 +1790,14 @@ static void cpuset_post_clone(struct cgroup *cgroup) static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cpuset *cs; - struct cpuset *parent; + struct cgroup *parent_cg = cont->parent; + struct cgroup *tmp_cg; + struct cpuset *parent, *cs; - if (!cont->parent) { + if (!parent_cg) return &top_cpuset.css; - } - parent = cgroup_cs(cont->parent); + parent = cgroup_cs(parent_cg); + cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); @@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cs->parent = parent; number_of_cpusets++; - return &cs->css ; + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) + goto skip_clone; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * histrical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. + */ + list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { + struct cpuset *tmp_cs = cgroup_cs(tmp_cg); + + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) + goto skip_clone; + } + + mutex_lock(&callback_mutex); + cs->mems_allowed = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + mutex_unlock(&callback_mutex); +skip_clone: + return &cs->css; } /* @@ -1882,7 +1875,6 @@ struct cgroup_subsys cpuset_subsys = { .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, - .post_clone = cpuset_post_clone, .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, -- cgit v1.2.3 From 0a950f65e1e64f4e82b4b5507773848ea88bcb8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 09:02:12 -0800 Subject: cgroup: add cgroup->id With the introduction of generic cgroup hierarchy iterators, css_id is being phased out. It was unnecessarily complex, id'ing the wrong thing (cgroups need IDs, not CSSes) and has other oddities like not being available at ->css_alloc(). This patch adds cgroup->id, which is a simple per-hierarchy ida-allocated ID which is assigned before ->css_alloc() and released after ->css_free(). 
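For readers who have not used the ida allocator the diff below relies on, here is a minimal sketch of the allocate-early / free-late pattern it follows. The demo_* names are invented for illustration; only ida_init(), ida_simple_get(), ida_simple_remove() and kzalloc()/kfree() are real kernel APIs.

#include <linux/idr.h>
#include <linux/slab.h>

struct demo_root {
        struct ida obj_ida;             /* IDs handed out in this hierarchy */
};

struct demo_obj {
        int id;
};

static void demo_root_init(struct demo_root *root)
{
        ida_init(&root->obj_ida);
}

static struct demo_obj *demo_obj_create(struct demo_root *root)
{
        struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        if (!obj)
                return NULL;

        /* 0 is reserved for the root object, so start allocating at 1. */
        obj->id = ida_simple_get(&root->obj_ida, 1, 0, GFP_KERNEL);
        if (obj->id < 0) {
                kfree(obj);
                return NULL;
        }
        return obj;
}

static void demo_obj_destroy(struct demo_root *root, struct demo_obj *obj)
{
        /* Release the ID only once nothing can look the object up anymore. */
        ida_simple_remove(&root->obj_ida, obj->id);
        kfree(obj);
}

The patch applies the same ordering to cgroups: the ID is taken with ida_simple_get(..., 1, 0, ...) before any ->css_alloc() runs, and dropped again on the error path and in cgroup_diput().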
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman --- kernel/cgroup.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 96719f73e95d..6db01417d39a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,6 +138,9 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; + /* IDs for cgroups in this hierarchy */ + struct ida cgroup_ida; + /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; @@ -890,6 +893,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) simple_xattrs_free(&cgrp->xattrs); + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); @@ -1465,6 +1469,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; + ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1483,6 +1488,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); spin_unlock(&hierarchy_id_lock); + ida_destroy(&root->cgroup_ida); kfree(root); } @@ -4093,10 +4099,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys *ss; struct super_block *sb = root->sb; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) + goto err_free_cgrp; + /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4106,7 +4117,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_cgrp; + goto err_free_id; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4198,6 +4209,8 @@ err_free_all: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); +err_free_id: + ida_simple_remove(&root->cgroup_ida, cgrp->id); err_free_cgrp: kfree(cgrp); return err; -- cgit v1.2.3 From 717a9ef7f355480686cdbac3f32d6075437a923e Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 18 Jul 2012 11:56:01 -0700 Subject: tracing: Remove unneeded checks from the stack tracer It seems that 'ftrace_enabled' flag should not be used inside the tracer functions. The ftrace core is using this flag for internal purposes, and the flag wasn't meant to be used in tracers' runtime checks. stack tracer is the only tracer that abusing the flag. So stop it from serving as a bad example. Also, there is a local 'stack_trace_disabled' flag in the stack tracer, which is never updated; so it can be removed as well. 
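To make the point of the cleanup concrete, here is a sketch of the shape such a tracer callback keeps once the redundant tests are gone: the ftrace core only invokes the callback while tracing is enabled, so the callback itself only needs preemption and recursion protection. The demo_* names and the exact parameter list are illustrative, not copied from trace_stack.c.

#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, demo_active);

static void demo_trace_call(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct pt_regs *regs)
{
        int cpu;

        /* No ftrace_enabled test: the core never calls us when disabled. */
        preempt_disable_notrace();

        cpu = raw_smp_processor_id();
        if (per_cpu(demo_active, cpu)++ != 0)
                goto out;               /* already tracing on this CPU */

        /* ... record the event ... */

out:
        per_cpu(demo_active, cpu)--;
        preempt_enable_notrace();
}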
Link: http://lkml.kernel.org/r/1342637761-9655-1-git-send-email-anton.vorontsov@linaro.org Signed-off-by: Anton Vorontsov Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0c1b165778e5..42ca822fc701 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -33,7 +33,6 @@ static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); static DEFINE_MUTEX(stack_sysctl_mutex); @@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, { int cpu; - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - preempt_disable_notrace(); cpu = raw_smp_processor_id(); -- cgit v1.2.3 From bf3071f5a054db9e5bab873355d27a7330ce5187 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 25 Jul 2012 11:39:08 -0400 Subject: tracing: Remove unnecessary WARN_ONCE's from tracing_buffers_splice_read WARN shouldn't be used as a means of communicating failure to a userspace programmer. Link: http://lkml.kernel.org/r/20120725153908.GA25203@redhat.com Signed-off-by: Dave Jones Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 64ad9bc4275b..5bc35907cc6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4275,13 +4275,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -ENOMEM; if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); ret = -EINVAL; goto out; } if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); if (len < PAGE_SIZE) { ret = -EINVAL; goto out; -- cgit v1.2.3 From 37657da3c5d4a3bbbbb9d3b78f53a8134a0abae0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 27 Jul 2012 06:21:27 -0700 Subject: userns: Allow setting a userns mapping to your current uid. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 456a6b9fba34..49096d559e08 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -709,6 +709,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow mapping to your own filesystem ids */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); + if (uid_eq(uid, current_fsuid())) + return true; + } + else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (gid_eq(gid, current_fsgid())) + return true; + } + } + /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; -- cgit v1.2.3 From b33c77ef23dd3ec5692c9c0cc739a3f5f0f2baae Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 26 Jul 2012 00:50:47 -0700 Subject: userns: Allow unprivileged users to create new namespaces If an unprivileged user has the appropriate capabilities in their current user namespace allow the creation of new namespaces. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/nsproxy.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 7f8b051fc19f..a214e0e9035f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -122,6 +122,7 @@ out_ns: int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -134,7 +135,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { err = -EPERM; goto out; } @@ -191,7 +192,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!capable(CAP_SYS_ADMIN)) + if (!nsown_capable(CAP_SYS_ADMIN)) return -EPERM; *new_nsp = create_new_namespaces(unshare_flags, current, -- cgit v1.2.3 From 142e1d1d5f088e7a38659daca6e84a730967774a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 01:13:20 -0700 Subject: userns: Allow unprivileged use of setns. - Push the permission check from the core setns syscall into the setns install methods where the user namespace of the target namespace can be determined, and used in a ns_capable call. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/nsproxy.c | 3 --- kernel/utsname.c | 7 ++++++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a214e0e9035f..4357a0a7d17d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -242,9 +242,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/kernel/utsname.c b/kernel/utsname.c index 679d97a5d3fd..4a9362f9325d 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -102,8 +102,13 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *ns) +static int utsns_install(struct nsproxy *nsproxy, void *new) { + struct uts_namespace *ns = new; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; -- cgit v1.2.3 From bcf58e725ddc45d31addbc6627d4f0edccc824c1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 04:02:49 -0700 Subject: userns: Make create_new_namespaces take a user_ns parameter Modify create_new_namespaces to explicitly take a user namespace parameter, instead of implicitly through the task_struct. This allows an implementation of unshare(CLONE_NEWUSER) where the new user namespace is not stored onto the current task_struct until after all of the namespaces are created. Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- kernel/nsproxy.c | 22 +++++++++++++--------- kernel/utsname.c | 9 ++++----- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 4357a0a7d17d..2ddd81657a2a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. */ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) + struct task_struct *tsk, struct user_namespace *user_ns, + struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, task_cred_xxx(tsk, user_ns), new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk); + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk); + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns); + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -152,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, tsk->fs); + new_ns = create_new_namespaces(flags, tsk, + task_cred_xxx(tsk, user_ns), tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -186,6 +188,7 @@ void free_nsproxy(struct nsproxy *ns) int unshare_nsproxy_namespaces(unsigned long unshare_flags, struct nsproxy **new_nsp, struct fs_struct *new_fs) { + struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | @@ -195,7 +198,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, if (!nsown_capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, + user_ns = current_user_ns(); + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, new_fs ? 
new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); @@ -252,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/kernel/utsname.c b/kernel/utsname.c index 4a9362f9325d..fdc619eb61ef 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -32,7 +32,7 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; } @@ -55,9 +55,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, * versa. */ struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -66,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(tsk, old_ns); + new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; -- cgit v1.2.3 From 4c44aaafa8108f584831850ab48a975e971db2de Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 05:05:21 -0700 Subject: userns: Kill task_user_ns The task_user_ns function hides the fact that it is getting the user namespace from struct cred on the task. struct cred may go away as soon as the rcu lock is released. This leads to a race where we can dereference a stale user namespace pointer. To make it obvious a struct cred is involved kill task_user_ns. To kill the race modify the users of task_user_ns to only reference the user namespace while the rcu lock is held. Cc: Kees Cook Cc: James Morris Acked-by: Kees Cook Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- kernel/ptrace.c | 10 ++++++++-- kernel/sched/core.c | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f5e55dda955..7b09b88862cc 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -215,8 +215,12 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + rcu_read_lock(); + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; + rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..2f5eb1838b3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4029,8 +4029,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if (retval) -- cgit v1.2.3 From cde1975bc242f3e1072bde623ef378e547b73f91 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 06:24:06 -0700 Subject: userns: Implent proc namespace operations This allows entering a user namespace, and the ability to store a reference to a user namespace with a bind mount. Addition of missing userns_ns_put in userns_install from Gao feng Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 90 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 49096d559e08..a9460774e77d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) +{ + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + cred->securebits = SECUREBITS_DEFAULT; + cred->cap_inheritable = CAP_EMPTY_SET; + cred->cap_permitted = CAP_FULL_SET; + cred->cap_effective = CAP_FULL_SET; + cred->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(cred->request_key_auth); + cred->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + cred->user_ns = user_ns; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -53,27 +72,12 @@ int create_user_ns(struct cred *new) return -ENOMEM; kref_init(&ns->kref); + /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - new->securebits = SECUREBITS_DEFAULT; - new->cap_inheritable = CAP_EMPTY_SET; - new->cap_permitted = CAP_FULL_SET; - new->cap_effective = CAP_FULL_SET; - new->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* Leave the new->user_ns reference with the new user namespace. */ - /* Leave the reference to our user_ns with the new cred. */ - new->user_ns = ns; + set_cred_user_ns(new, ns); return 0; } @@ -737,6 +741,58 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } +static void *userns_get(struct task_struct *task) +{ + struct user_namespace *user_ns; + + rcu_read_lock(); + user_ns = get_user_ns(__task_cred(task)->user_ns); + rcu_read_unlock(); + + return user_ns; +} + +static void userns_put(void *ns) +{ + put_user_ns(ns); +} + +static int userns_install(struct nsproxy *nsproxy, void *ns) +{ + struct user_namespace *user_ns = ns; + struct cred *cred; + + /* Don't allow gaining capabilities by reentering + * the same user namespace. + */ + if (user_ns == current_user_ns()) + return -EINVAL; + + /* Threaded many not enter a different user namespace */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + put_user_ns(cred->user_ns); + set_cred_user_ns(cred, get_user_ns(user_ns)); + + return commit_creds(cred); +} + +const struct proc_ns_operations userns_operations = { + .name = "user", + .type = CLONE_NEWUSER, + .get = userns_get, + .put = userns_put, + .install = userns_install, +}; + static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); -- cgit v1.2.3 From b2e0d98705e60e45bbb3c0032c48824ad7ae0704 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 05:15:35 -0700 Subject: userns: Implement unshare of the user namespace - Add CLONE_THREAD to the unshare flags if CLONE_NEWUSER is selected As changing user namespaces is only valid if all there is only a single thread. - Restore the code to add CLONE_VM if CLONE_THREAD is selected and the code to addCLONE_SIGHAND if CLONE_VM is selected. Making the constraints in the code clear. Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- kernel/fork.c | 25 ++++++++++++++++++++++--- kernel/nsproxy.c | 8 ++++---- kernel/user_namespace.c | 15 +++++++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8c29abb19014..38e53b87402c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1687,7 +1687,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1754,10 +1754,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; /* * If unsharing a pid namespace must also unshare the thread. */ @@ -1795,11 +1801,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1832,11 +1842,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 2ddd81657a2a..78e2ecb20165 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -186,7 +186,7 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; int err = 0; @@ -195,12 +195,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!nsown_capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - user_ns = current_user_ns(); *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, - new_fs ? new_fs : current->fs); + new_fs ? 
new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9460774e77d..ce92f7e6290a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -82,6 +82,21 @@ int create_user_ns(struct cred *new) return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = -- cgit v1.2.3 From c450f371d48557e3e0fa510a4af27b92f0d8c4cc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 14 Aug 2012 21:25:13 -0700 Subject: userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file To keep things sane in the context of file descriptor passing derive the user namespace that uids are mapped into from the opener of the file instead of from current. When writing to the maps file the lower user namespace must always be the parent user namespace, or setting the mapping simply does not make sense. Enforce that the opener of the file was in the parent user namespace or the user namespace whose mapping is being set. Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ce92f7e6290a..89f6eaed067a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -391,7 +391,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -412,7 +412,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -688,10 +688,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -700,10 +704,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } -- cgit v1.2.3 From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 10:21:48 -0700 Subject: proc: Usable inode numbers for the namespace file descriptors. Assign a unique proc inode to each namespace, and use that inode number to ensure we only allocate at most one proc inode for every namespace in proc. A single proc inode per namespace allows userspace to test to see if two processes are in the same namespace. 
This has been a long requested feature and only blocked because a naive implementation would put the id in a global space and would ultimately require having a namespace for the names of namespaces, making migration and certain virtualization tricks impossible. We still don't have per superblock inode numbers for proc, which appears necessary for application unaware checkpoint/restart and migrations (if the application is using namespace file descriptors) but that is now allowd by the design if it becomes important. I have preallocated the ipc and uts initial proc inode numbers so their structures can be statically initialized. Signed-off-by: Eric W. Biederman --- kernel/pid.c | 1 + kernel/pid_namespace.c | 12 ++++++++++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 15 +++++++++++++++ kernel/utsname.c | 17 ++++++++++++++++- 5 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 6e8da291de49..3026ddae0a34 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -80,6 +80,7 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 68508d330634..560da0dab230 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,6 +107,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); @@ -133,6 +137,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -345,12 +350,19 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/user.c b/kernel/user.c index 750acffbe9ec..33acb5e53a5f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 89f6eaed067a..f5975ccf9348 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -71,6 +72,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; @@ -103,6 +110,7 @@ void free_user_ns(struct kref *kref) container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -808,12 +816,19 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index fdc619eb61ef..f6336d51d64c 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -36,11 +36,18 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -77,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -114,11 +122,18 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - -- cgit v1.2.3 From d0b2fdd2a51203f04ea0a5d716e033c15e0231af Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 20 Nov 2012 22:06:18 +0800 Subject: cgroup: remove obsolete guarantee from cgroup_task_migrate. 'guarantee' is already removed from cgroup_task_migrate, so remove the corresponding comments. Some other typos in cgroup are also changed. Cc: Tejun Heo Cc: Li Zefan Signed-off-by: Tao Ma Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6db01417d39a..55a0a770a5a2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -782,12 +782,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * The task_lock() exception * * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with + * cgroup_attach_task(), which overwrites one task's cgroup pointer with * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task's cgroup pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * @@ -1882,9 +1882,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); /* * cgroup_task_migrate - move a task from one cgroup to another. 
* - * 'guarantee' is set if the caller promises that a new css_set for the task - * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex and threadgroup locked. */ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) -- cgit v1.2.3 From 8ffeb9b0e6369135bf03a073514f571ef10606b9 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 26 Nov 2012 16:29:54 -0800 Subject: watchdog: using u64 in get_sample_period() In get_sample_period(), unsigned long is not enough: watchdog_thresh * 2 * (NSEC_PER_SEC / 5) case1: watchdog_thresh is 10 by default, the sample value will be: 0xEE6B2800 case2: set watchdog_thresh is 20, the sample value will be: 0x1 DCD6 5000 In case2, we need use u64 to express the sample period. Otherwise, changing the threshold thru proc often can not be successful. Signed-off-by: liu chuansheng Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9d4c8d5a1f53..dd4b80a9f1a9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -116,7 +116,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static unsigned long get_sample_period(void) +static u64 get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -125,7 +125,7 @@ static unsigned long get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * (NSEC_PER_SEC / 5); + return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ -- cgit v1.2.3 From aa10990e028cac3d5e255711fb9fb47e00700e35 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 26 Nov 2012 16:29:56 -0800 Subject: futex: avoid wake_futex() for a PI futex_q Dave Jones reported a bug with futex_lock_pi() that his trinity test exposed. Sometime between queue_me() and taking the q.lock_ptr, the lock_ptr became NULL, resulting in a crash. While futex_wake() is careful to not call wake_futex() on futex_q's with a pi_state or an rt_waiter (which are either waiting for a futex_unlock_pi() or a PI futex_requeue()), futex_wake_op() and futex_requeue() do not perform the same test. Update futex_wake_op() and futex_requeue() to test for q.pi_state and q.rt_waiter and abort with -EINVAL if detected. To ensure any future breakage is caught, add a WARN() to wake_futex() if the same condition is true. This fix has seen 3 hours of testing with "trinity -c futex" on an x86_64 VM with 4 CPUS. 
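Condensed, the invariant the patch enforces in futex_wake_op() and futex_requeue() looks like the sketch below; struct futex_q and wake_futex() are private to kernel/futex.c, and the helper name is made up for illustration.

/* Hypothetical helper mirroring the added checks: only a plain waiter
 * (no PI state, no rt_waiter) may go through the ordinary wake path. */
static int demo_wake_if_plain(struct futex_q *q)
{
        if (q->pi_state || q->rt_waiter)
                return -EINVAL; /* PI waiter: needs futex_unlock_pi() or a PI requeue */

        wake_futex(q);
        return 0;
}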
[akpm@linux-foundation.org: tidy up the WARN()] Signed-off-by: Darren Hart Reported-by: Dave Jones Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: John Kacur Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 20ef219bbe9b..19eb089ca003 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -843,6 +843,9 @@ static void wake_futex(struct futex_q *q) { struct task_struct *p = q->task; + if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) + return; + /* * We set q->lock_ptr = NULL _before_ we wake up the task. If * a non-futex wake up happens on another CPU then the task @@ -1078,6 +1081,10 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++ret >= nr_wake) break; @@ -1090,6 +1097,10 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++op_ret >= nr_wake2) break; @@ -1098,6 +1109,7 @@ retry_private: ret += op_ret; } +out_unlock: double_unlock_hb(hb1, hb2); out_put_keys: put_futex_key(&key2); @@ -1387,9 +1399,13 @@ retry_private: /* * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter)) { + (!requeue_pi && this->rt_waiter) || + this->pi_state) { ret = -EINVAL; break; } -- cgit v1.2.3 From 582b336ec2c0f0076f5650a029fcc9abd4a906f7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:54 -0200 Subject: sched: add notifier for cross-cpu migrations Originally from Jeremy Fitzhardinge. Acked-by: Ingo Molnar Signed-off-by: Marcelo Tosatti --- kernel/sched/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..c86b8b6e70f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -922,6 +922,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq->skip_clock_update = 1; } +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -952,8 +959,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); -- cgit v1.2.3 From e0b306fef90556233797d2e1747bd6a3ae35ea93 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 Nov 2012 23:28:59 -0200 Subject: time: export time information for KVM pvclock As suggested by John, export time data similarly to how its done by vsyscall support. 
This allows KVM to retrieve necessary information to implement vsyscall support in KVM guests. Acked-by: John Stultz Signed-off-by: Marcelo Tosatti --- kernel/time/timekeeping.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e424970bb562..69f5342e8d1c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct timekeeper timekeeper; @@ -180,6 +181,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + /* update timekeeping data */ + update_pvclock_gtod(tk); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + * + * Must hold write on timekeeper.lock + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + int ret; + + write_seqlock_irqsave(&tk->lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + write_sequnlock_irqrestore(&tk->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { @@ -188,6 +237,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) ntp_clear(); } update_vsyscall(tk); + update_pvclock_gtod(tk); } /** -- cgit v1.2.3 From fddfb02ad0d0d3b479c2a26a8ae7e6411b34706b Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 28 Nov 2012 17:15:21 +0800 Subject: cgroup: move list add after list head initilization 2243076ad1 ("cgroup: initialize cgrp->allcg_node in init_cgroup_housekeeping()") initializes cgrp->allcg_node in init_cgroup_housekeeping(). Then in init_cgroup_root(), we should call init_cgroup_housekeeping() before adding it to &root->allcg_list; otherwise, we are initializing an entry already in a list. 
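The bug class here is easy to reproduce outside cgroup: re-initializing a list_head after it has been linked leaves the list pointing at an entry that no longer points back. A minimal illustration with made-up demo_* names follows.

#include <linux/list.h>

struct demo_node {
        struct list_head link;
};

/* Broken order: the entry is linked first, then INIT_LIST_HEAD() resets
 * link.next/link.prev to point at itself while @head still references it. */
static void demo_add_broken(struct list_head *head, struct demo_node *n)
{
        list_add_tail(&n->link, head);
        INIT_LIST_HEAD(&n->link);
}

/* Fixed order, as the patch restores for init_cgroup_root(). */
static void demo_add_fixed(struct list_head *head, struct demo_node *n)
{
        INIT_LIST_HEAD(&n->link);
        list_add_tail(&n->link, head);
}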
Signed-off-by: Li Zhong Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 55a0a770a5a2..c02b05560d10 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1401,8 +1401,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - list_add_tail(&cgrp->allcg_node, &root->allcg_list); init_cgroup_housekeeping(cgrp); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); } static bool init_root_id(struct cgroupfs_root *root) -- cgit v1.2.3 From a634f93335daa8f38180a0e576ccd68a73c36eaf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 15:55:59 +0100 Subject: cputime: Move thread_group_cputime() to sched code thread_group_cputime() is a general cputime API that is not only used by posix cpu timer. Let's move this helper to sched code. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/posix-cpu-timers.c | 24 ------------------------ kernel/sched/cputime.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..d73840271dce 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { - times->utime += t->utime; - times->stime += t->stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: - rcu_read_unlock(); -} - static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { if (b->utime > a->utime) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8d859dae5bed..e56f138a23c7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) return false; } +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. 
+ */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING -- cgit v1.2.3 From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 16:26:44 +0100 Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted We have thread_group_cputime() and thread_group_times(). The naming doesn't provide enough information about the difference between these two APIs. To lower the confusion, rename thread_group_times() to thread_group_cputime_adjusted(). This name better suggests that it's a version of thread_group_cputime() that does some stabilization on the raw cputime values. ie here: scale on top of CFS runtime stats and bound lower value for monotonicity. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/exit.c | 4 ++-- kernel/sched/cputime.c | 8 ++++---- kernel/sys.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..618f7ee56003 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_times() to get times for the thread + * We use thread_group_cputime_adjusted() to get times for the thread * group, which consolidates times for all threads in the * group including the group leader. */ - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e56f138a23c7..7dc155371b95 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -445,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -516,7 +516,7 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -543,7 +543,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Must be called with siglock held. 
*/ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct signal_struct *sig = p->signal; struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index e6e0ece5f6a0..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms) cputime_t tgutime, tgstime, cutime, cstime; spin_lock_irq(¤t->sighand->siglock); - thread_group_times(current, &tgutime, &tgstime); + thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); @@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = 0; if (who == RUSAGE_THREAD) { - task_times(current, &utime, &stime); + task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; -- cgit v1.2.3 From d37f761dbd276790f70dcf73a287fde2c3464482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Nov 2012 00:58:35 +0100 Subject: cputime: Consolidate cputime adjustment code task_cputime_adjusted() and thread_group_cputime_adjusted() essentially share the same code. They just don't use the same source: * The first function uses the cputime in the task struct and the previous adjusted snapshot that ensures monotonicity. * The second adds the cputime of all tasks in the group and the previous adjusted snapshot of the whole group from the signal structure. Just consolidate the common code that does the adjustment. These functions just need to fetch the values from the appropriate source. 
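Concretely, after the patch both entry points reduce to thin wrappers around a single helper; roughly (condensed from the hunks below for readability):

  void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
  {
          struct task_cputime cputime = {
                  .utime = p->utime,
                  .stime = p->stime,
                  .sum_exec_runtime = p->se.sum_exec_runtime,
          };

          /* per task: raw values and the previous snapshot live in the task struct */
          cputime_adjust(&cputime, &p->prev_cputime, ut, st);
  }

  void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
  {
          struct task_cputime cputime;

          /* per group: raw values are summed up, the snapshot lives in the signal struct */
          thread_group_cputime(p, &cputime);
          cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
  }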
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/fork.c | 2 +- kernel/sched/cputime.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..0e7cdb90476f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7dc155371b95..220fdc4db770 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,14 +516,18 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + utime = curr->utime; + total = utime + curr->stime; /* * Use CFS's precise accounting: */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -533,11 +537,22 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Compare with previous values, to keep monotonicity: */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); + + *ut = prev->utime; + *st = prev->stime; +} - *ut = p->prev_utime; - *st = p->prev_stime; +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* @@ -545,24 +560,9 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif -- cgit v1.2.3 From fa09205783d11cc05122ad6e4ce06074624b2c0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 28 Nov 2012 17:00:57 +0100 Subject: cputime: Comment cputime's adjusting code The reason for the scaling and monotonicity correction performed by cputime_adjust() may not be immediately clear to the reviewer. Add some comments to explain what happens there. 
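A rough worked example of what the adjustment does, with made-up numbers:

  tick based samples:  utime = 60, stime = 60  ->  total = 120
  CFS runtime:         rtime = 100  (sum_exec_runtime converted to cputime)

  scaled utime = 60 * 100 / 120                = 50
  prev->utime  = max(prev->utime, 50)          = 50
  prev->stime  = max(prev->stime, rtime - 50)  = 50

If a later sample happens to scale utime down to, say, 48, the max() against the previous snapshot keeps the reported value at 50, so the cputime seen by userspace never goes backwards.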
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/sched/cputime.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 220fdc4db770..b7f731768625 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,6 +516,10 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) @@ -524,8 +528,16 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; total = utime + curr->stime; + /* - * Use CFS's precise accounting: + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); @@ -535,7 +547,9 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime; /* - * Compare with previous values, to keep monotonicity: + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. */ prev->utime = max(prev->utime, utime); prev->stime = max(prev->stime, rtime - prev->utime); -- cgit v1.2.3 From 3b572b506c76a7a38c8debe45b50d09295620810 Mon Sep 17 00:00:00 2001 From: Bill Pemberton Date: Mon, 19 Nov 2012 13:19:29 -0500 Subject: sysctl: remove CONFIG_HOTPLUG ifdefs Remove conditional code based on CONFIG_HOTPLUG being false. It's always on now in preparation of it going away as an option. Signed-off-by: Bill Pemberton Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f9..33f71f37267e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -565,7 +565,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_HOTPLUG + { .procname = "hotplug", .data = &uevent_helper, @@ -573,7 +573,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, -#endif + #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", -- cgit v1.2.3 From e3a1a5ec5cd516c85848611992102fc00e8af601 Mon Sep 17 00:00:00 2001 From: Bill Pemberton Date: Mon, 19 Nov 2012 13:19:30 -0500 Subject: kernel/ksysfs.c: remove CONFIG_HOTPLUG ifdefs Remove conditional code based on CONFIG_HOTPLUG being false. It's always on now in preparation of it going away as an option. 
Signed-off-by: Bill Pemberton Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 4e316e1acf58..c01eac66c0cc 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#if defined(CONFIG_HOTPLUG) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(uevent_helper); -#endif + #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, @@ -169,10 +168,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, -#if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, -#endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -- cgit v1.2.3 From 205a872bd6f9a9a09ef035ef1e90185a8245cc58 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 28 Nov 2012 13:50:44 -0800 Subject: cgroup: fix lockdep warning for event_control The cgroup_event_wake() function is called with the wait queue head locked and it takes cgrp->event_list_lock. However, in cgroup_rmdir() remove_wait_queue() was being called after taking cgrp->event_list_lock. Correct the lock ordering by using a temporary list to obtain the event list to remove from the wait queue. Signed-off-by: Greg Thelen Signed-off-by: Aaron Durbin Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c02b05560d10..589433f7a74b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4277,6 +4277,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; + LIST_HEAD(tmp_list); lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4331,16 +4332,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace + * directory to avoid race between userspace and kernelspace. Use + * a temporary list to avoid a deadlock with cgroup_event_wake(). Since + * cgroup_event_wake() is called with the wait queue head locked, + * remove_wait_queue() cannot be called while holding event_list_lock. */ spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_splice_init(&cgrp->event_list, &tmp_list); + spin_unlock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &tmp_list, list) { list_del(&event->list); remove_wait_queue(event->wqh, &event->wait); eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); return 0; } -- cgit v1.2.3 From 9718ceb3431acdeee9bdec5c18e18266333970aa Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 28 Nov 2012 13:50:45 -0800 Subject: cgroup: list_del_init() on removed events Use list_del_init() rather than list_del() to remove events from cgrp->event_list. No functional change. This is just defensive coding. 
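For reference, the practical difference between the two helpers in this context (sketch; the struct is a stand-in for struct cgroup_event):

  struct event_sketch {
          struct list_head list;
  };

  static void detach_event(struct event_sketch *event)
  {
          /*
           * list_del() leaves event->list.next/prev pointing at
           * LIST_POISON1/2, so a second removal of the same entry would
           * write through poison pointers, and list_empty() on it gives a
           * meaningless answer.
           *
           * list_del_init() unlinks the entry and re-initializes it to an
           * empty list, so a repeated removal or a list_empty() check on
           * it stays harmless.
           */
          list_del_init(&event->list);
  }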
Signed-off-by: Greg Thelen Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 589433f7a74b..46dcdbd7485b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3767,7 +3767,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, if (flags & POLLHUP) { __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); - list_del(&event->list); + list_del_init(&event->list); spin_unlock(&cgrp->event_list_lock); /* * We are in atomic context, but cgroup_event_remove() may @@ -4341,7 +4341,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_splice_init(&cgrp->event_list, &tmp_list); spin_unlock(&cgrp->event_list_lock); list_for_each_entry_safe(event, tmp, &tmp_list, list) { - list_del(&event->list); + list_del_init(&event->list); remove_wait_queue(event->wqh, &event->wait); eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); -- cgit v1.2.3 From c4144670fd9b34d6eae22c9f83751745898e8243 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 2 Oct 2012 16:34:38 -0400 Subject: kill daemonize() Signed-off-by: Al Viro --- kernel/exit.c | 92 ----------------------------------------------------------- 1 file changed, 92 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..f9275e2c7c2c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -322,43 +322,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/** - * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to kthreadd so it - * isn't in the way of other processes and is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited from a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_kthreadd() gives the caller full capabilities. - */ -static void reparent_to_kthreadd(void) -{ - write_lock_irq(&tasklist_lock); - - ptrace_unlink(current); - /* Reparent to init */ - current->real_parent = current->parent = kthreadd_task; - list_move_tail(¤t->sibling, ¤t->real_parent->children); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - if (task_nice(current) < 0) - set_user_nice(current, 0); - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - memcpy(current->signal->rlim, init_task.signal->rlim, - sizeof(current->signal->rlim)); - - atomic_inc(&init_cred.usage); - commit_creds(&init_cred); - write_unlock_irq(&tasklist_lock); -} - void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; @@ -370,13 +333,6 @@ void __set_special_pids(struct pid *pid) change_pid(curr, PIDTYPE_PGID, pid); } -static void set_special_pids(struct pid *pid) -{ - write_lock_irq(&tasklist_lock); - __set_special_pids(pid); - write_unlock_irq(&tasklist_lock); -} - /* * Let kernel threads use this to say that they allow a certain signal. * Must not be used if kthread was cloned with CLONE_SIGHAND. @@ -416,54 +372,6 @@ int disallow_signal(int sig) EXPORT_SYMBOL(disallow_signal); -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(const char *name, ...) 
-{ - va_list args; - sigset_t blocked; - - va_start(args, name); - vsnprintf(current->comm, sizeof(current->comm), name, args); - va_end(args); - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. - */ - exit_mm(current); - /* - * We don't want to get frozen, in case system-wide hibernation - * or suspend transition begins right now. - */ - current->flags |= (PF_NOFREEZE | PF_KTHREAD); - - if (current->nsproxy != &init_nsproxy) { - get_nsproxy(&init_nsproxy); - switch_task_namespaces(current, &init_nsproxy); - } - set_special_pids(&init_struct_pid); - proc_clear_tty(current); - - /* Block and flush all signals */ - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - - /* Become as one with the init task */ - - daemonize_fs_struct(); - daemonize_descriptors(); - - reparent_to_kthreadd(); -} - -EXPORT_SYMBOL(daemonize); - #ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. -- cgit v1.2.3 From d2125043aebf7f53cd1c72115c17b01d0bc06ce1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 23 Oct 2012 13:17:59 -0400 Subject: generic sys_fork / sys_vfork / sys_clone ... and get rid of idiotic struct pt_regs * in asm-generic/syscalls.h prototypes of the same, while we are at it. Eventually we want those in linux/syscalls.h, of course, but that'll have to wait a bit. Note that there are *three* variants of sys_clone() order of arguments. Braindamage galore... Signed-off-by: Al Viro --- kernel/fork.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..27a337549dab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1645,6 +1645,49 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) } #endif +#ifdef __ARCH_WANT_SYS_FORK +SYSCALL_DEFINE0(fork) +{ +#ifdef CONFIG_MMU + return do_fork(SIGCHLD, 0, current_pt_regs(), 0, NULL, NULL); +#else + /* can not support in nommu mode */ + return(-EINVAL); +#endif +} +#endif + +#ifdef __ARCH_WANT_SYS_VFORK +SYSCALL_DEFINE0(vfork) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, current_pt_regs(), + 0, NULL, NULL); +} +#endif + +#ifdef __ARCH_WANT_SYS_CLONE +#ifdef CONFIG_CLONE_BACKWARDS +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, + int __user *, parent_tidptr, + int, tls_val, + int __user *, child_tidptr) +#elif defined(CONFIG_CLONE_BACKWARDS2) +SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) +#else +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) +#endif +{ + return do_fork(clone_flags, newsp, current_pt_regs(), 0, + parent_tidptr, child_tidptr); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -- cgit v1.2.3 From c62d773a3751610010feb574d859f58de4a51eba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 20 Oct 2012 15:07:18 -0400 Subject: audit: no nested contexts anymore... 
Signed-off-by: Al Viro --- kernel/auditsc.c | 102 ++++++++++++------------------------------------------- 1 file changed, 21 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2f186ed80c40..c8ca7fafbcc9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -200,7 +200,6 @@ struct audit_context { struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; - struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; struct sockaddr_storage *sockaddr; @@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { - struct audit_context *previous; - int count = 0; - - do { - previous = context->previous; - if (previous || (count && count < 10)) { - ++count; - printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" - " freeing multiple contexts (%d)\n", - context->serial, context->major, - context->name_count, count); - } - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - free_tree_refs(context); - audit_free_aux(context); - kfree(context->filterkey); - kfree(context->sockaddr); - kfree(context); - context = previous; - } while (context); - if (count >= 10) - printk(KERN_ERR "audit: freed %d contexts\n", count); + audit_free_names(context); + unroll_tree_refs(context, NULL, 0); + free_tree_refs(context); + audit_free_aux(context); + kfree(context->filterkey); + kfree(context->sockaddr); + kfree(context); } void audit_log_task_context(struct audit_buffer *ab) @@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major, if (!context) return; - /* - * This happens only on certain architectures that make system - * calls in kernel_thread via the entry.S interface, instead of - * with direct calls. (If you are porting to a new - * architecture, hitting this condition can indicate that you - * got the _exit/_leave calls backward in entry.S.) - * - * i386 no - * x86_64 no - * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) - * - * This also happens with vm86 emulation in a non-nested manner - * (entries without exits), so this case must be caught. - */ - if (context->in_syscall) { - struct audit_context *newctx; - -#if AUDIT_DEBUG - printk(KERN_ERR - "audit(:%d) pid=%d in syscall=%d;" - " entering syscall=%d\n", - context->serial, tsk->pid, context->major, major); -#endif - newctx = audit_alloc_context(context->state); - if (newctx) { - newctx->previous = context; - context = newctx; - tsk->audit_context = newctx; - } else { - /* If we can't alloc a new context, the best we - * can do is to leak memory (any pending putname - * will be lost). The only other alternative is - * to abandon auditing. 
*/ - audit_zero_context(context, context->state); - } - } BUG_ON(context->in_syscall || context->name_count); if (!audit_enabled) @@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(&context->killed_trees); - if (context->previous) { - struct audit_context *new_context = context->previous; - context->previous = NULL; - audit_free_context(context); - tsk->audit_context = new_context; - } else { - audit_free_names(context); - unroll_tree_refs(context, NULL, 0); - audit_free_aux(context); - context->aux = NULL; - context->aux_pids = NULL; - context->target_pid = 0; - context->target_sid = 0; - context->sockaddr_len = 0; - context->type = 0; - context->fds[0] = -1; - if (context->state != AUDIT_RECORD_CONTEXT) { - kfree(context->filterkey); - context->filterkey = NULL; - } - tsk->audit_context = context; + audit_free_names(context); + unroll_tree_refs(context, NULL, 0); + audit_free_aux(context); + context->aux = NULL; + context->aux_pids = NULL; + context->target_pid = 0; + context->target_sid = 0; + context->sockaddr_len = 0; + context->type = 0; + context->fds[0] = -1; + if (context->state != AUDIT_RECORD_CONTEXT) { + kfree(context->filterkey); + context->filterkey = NULL; } + tsk->audit_context = context; } static inline void handle_one(const struct inode *inode) -- cgit v1.2.3 From afa86fc426ff7e7f5477f15da9c405d08d5cf790 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 22:51:14 -0400 Subject: flagday: don't pass regs to copy_thread() Signed-off-by: Al Viro --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 27a337549dab..d96a562b1311 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1320,7 +1320,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3 From 62e791c1b8ea481c72c299dee4f62c04aaef765c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 22:52:26 -0400 Subject: don't pass regs to copy_process() Signed-off-by: Al Viro --- kernel/fork.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d96a562b1311..fa24a78b94f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1127,7 +1127,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk) */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, @@ -1536,8 +1535,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1596,7 +1594,7 @@ long do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, regs, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer -- cgit v1.2.3 From 18c26c27ae0abe82253cb2e2363df465dbbb657e Mon Sep 17 00:00:00 2001 
From: Al Viro Date: Mon, 22 Oct 2012 22:53:20 -0400 Subject: death to idle_regs() Signed-off-by: Al Viro --- kernel/fork.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index fa24a78b94f2..0e68b6686acb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1514,12 +1514,6 @@ fork_out: return ERR_PTR(retval); } -noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - return regs; -} - static inline void init_idle_pids(struct pid_link *links) { enum pid_type type; -- cgit v1.2.3 From e80d6661c3a5caa0cebec0853c6cb0db090fb506 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 23:10:08 -0400 Subject: flagday: kill pt_regs argument of do_fork() Signed-off-by: Al Viro --- kernel/fork.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0e68b6686acb..540730783433 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,8 +1527,6 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; - struct pt_regs regs; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); @@ -1546,7 +1544,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) */ long do_fork(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) @@ -1576,7 +1573,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ - if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { + if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1632,7 +1629,7 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } #endif @@ -1641,7 +1638,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, current_pt_regs(), 0, NULL, NULL); + return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ return(-EINVAL); @@ -1652,7 +1649,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, current_pt_regs(), + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif @@ -1675,7 +1672,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, current_pt_regs(), 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif -- cgit v1.2.3 From b7f9591c44505ee16ed4561cfeb3642798bdd132 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Nov 2012 13:06:22 -0500 Subject: get rid of ptrace_signal_deliver() arguments the first one is equal to signal_pt_regs(), the second is never used (and always NULL, while we are at it). 
Signed-off-by: Al Viro --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..17d4e17fd614 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2141,7 +2141,7 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info, struct pt_regs *regs, void *cookie) { - ptrace_signal_deliver(regs, cookie); + ptrace_signal_deliver(); /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will -- cgit v1.2.3 From 94eb22d5056b5eca10c587bf064e0e6db0ae6161 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Nov 2012 13:08:06 -0500 Subject: ptrace_signal(): get rid of unused arguments Signed-off-by: Al Viro --- kernel/signal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 17d4e17fd614..421d5b6bcdbc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2138,8 +2138,7 @@ static void do_jobctl_trap(void) } } -static int ptrace_signal(int signr, siginfo_t *info, - struct pt_regs *regs, void *cookie) +static int ptrace_signal(int signr, siginfo_t *info) { ptrace_signal_deliver(); /* @@ -2265,8 +2264,7 @@ relock: break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); + signr = ptrace_signal(signr, info); if (!signr) continue; } -- cgit v1.2.3 From 4aaefee589f97bb30062bbeeb793bbbf7107a9a7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Nov 2012 13:09:56 -0500 Subject: print_fatal_signal(): get rid of pt_regs argument Signed-off-by: Al Viro --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 421d5b6bcdbc..2e3b5d572808 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1159,8 +1159,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, return __send_signal(sig, info, t, group, from_ancestor_ns); } -static void print_fatal_signal(struct pt_regs *regs, int signr) +static void print_fatal_signal(int signr) { + struct pt_regs *regs = signal_pt_regs(); printk("%s/%d: potentially unexpected fatal signal %d.\n", current->comm, task_pid_nr(current), signr); @@ -2349,7 +2350,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(regs, info->si_signo); + print_fatal_signal(info->si_signo); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with -- cgit v1.2.3 From 541880d9a2c7871f6370071d55aa6662d329c51e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Nov 2012 13:11:26 -0500 Subject: do_coredump(): get rid of pt_regs argument Signed-off-by: Al Viro --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2e3b5d572808..e75e4bd2839b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2359,7 +2359,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info, regs); + do_coredump(info); } /* -- cgit v1.2.3 From d202b7b970b2a21278505c2467919fd441a7b6c8 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 27 Nov 2012 01:20:32 +0100 Subject: irqdomain: stop screaming about preallocated irqdescs In the simple irqdomain: don't shout warnings to the user, there is no point. 
An informational print is sufficient. Signed-off-by: Linus Walleij Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4e69e24d3d7d..96f3a1d9c379 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, irq_base = irq_alloc_descs(first_irq, first_irq, size, of_node_to_nid(of_node)); if (irq_base < 0) { - WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", - first_irq); + pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + first_irq); irq_base = first_irq; } } else -- cgit v1.2.3 From 1f869e8711d18aaf6e2979922bc9377ad394b82f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 30 Nov 2012 17:31:23 +0400 Subject: cgroup: warn about broken hierarchies only after css_online If everything goes right, it shouldn't really matter if we are spitting this warning after css_alloc or css_online. If we fail in between, there are some ill cases where we would previously see the message and now we won't (like if the files fail to be created). I believe it really shouldn't matter: this message is intended in spirit to be shown when creation succeeds, but with insane settings. Signed-off-by: Glauber Costa Signed-off-by: Tejun Heo --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 46dcdbd7485b..b186a7e25b0a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4151,15 +4151,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_free_all; } - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } } /* @@ -4188,6 +4179,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = online_css(ss, cgrp); if (err) goto err_destroy; + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); -- cgit v1.2.3 From 54f7be5b831254199522523ccab4c3d954bbf576 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Nov 2012 22:27:22 -0500 Subject: ring-buffer: Fix NULL pointer if rb_set_head_page() fails The function rb_set_head_page() searches the list of ring buffer pages for the page that has the HEAD page flag set. If it does not find it, it will do a WARN_ON(), disable the ring buffer and return NULL, as this should never happen. But if this bug happens to happen, not all callers of this function can handle a NULL pointer being returned from it. That needs to be fixed.
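The fix is a defensive NULL check at each call site; in rb_get_reader_page(), for example, it boils down to (condensed from the hunk below):

  reader = rb_set_head_page(cpu_buffer);
  if (!reader)
          goto out;

and in ring_buffer_oldest_event_ts() the timestamp is only read when a page was actually found.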
Cc: stable@vger.kernel.org # 3.0+ Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b979426d16c6..ec01803e0a55 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1396,6 +1396,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head_page_with_bit; head_page = &rb_set_head_page(cpu_buffer)->list; + if (!head_page) + break; prev_page = head_page->prev; first_page = pages->next; @@ -2934,7 +2936,7 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long ret; + unsigned long ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; @@ -2949,7 +2951,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); - ret = bpage->page->time_stamp; + if (bpage) + ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; @@ -3260,6 +3263,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); + if (!reader) + goto out; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; -- cgit v1.2.3 From 9366c1ba13fbc41bdb57702e75ca4382f209c82f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 Nov 2012 22:31:16 -0500 Subject: ring-buffer: Fix race between integrity check and readers The function rb_check_pages() was added to make sure the ring buffer's pages were sane. This check is done when the ring buffer size is modified as well as when the iterator is released (closing the "trace" file), as that was considered a non fast path and a good place to do a sanity check. The problem is that the check does not have any locks around it. If one process were to read the trace file, and another were to read the raw binary file, the check could happen while the reader is reading the file. The issues with this is that the check requires to clear the HEAD page before doing the full check and it restores it afterward. But readers require the HEAD page to exist before it can read the buffer, otherwise it gives a nasty warning and disables the buffer. By adding the reader lock around the check, this keeps the race from happening. Cc: stable@vger.kernel.org # 3.6 Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ec01803e0a55..4cb5e5147165 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3783,12 +3783,17 @@ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; /* * Ring buffer is disabled from recording, here's a good place - * to check the integrity of the ring buffer. + * to check the integrity of the ring buffer. + * Must prevent readers from trying to read, as the check + * clears the HEAD page and readers require it. 
*/ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->buffer->resize_disabled); -- cgit v1.2.3 From 91d1aa43d30505b0b825db8898ffc80a8eca96c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 27 Nov 2012 19:33:25 +0100 Subject: context_tracking: New context tracking susbsystem Create a new subsystem that probes on kernel boundaries to keep track of the transitions between level contexts with two basic initial contexts: user or kernel. This is an abstraction of some RCU code that use such tracking to implement its userspace extended quiescent state. We need to pull this up from RCU into this new level of indirection because this tracking is also going to be used to implement an "on demand" generic virtual cputime accounting. A necessary step to shutdown the tick while still accounting the cputime. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Li Zhong Cc: Gilad Ben-Yossef Reviewed-by: Steven Rostedt [ paulmck: fix whitespace error and email address. ] Signed-off-by: Paul E. McKenney --- kernel/Makefile | 1 + kernel/context_tracking.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree.c | 64 ++---------------------------------- kernel/sched/core.c | 11 ++++--- 4 files changed, 92 insertions(+), 67 deletions(-) create mode 100644 kernel/context_tracking.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0dfeca4324ee..f90bbfc9727f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 000000000000..e0e07fd55508 --- /dev/null +++ b/kernel/context_tracking.c @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include + +struct context_tracking { + /* + * When active is false, hooks are not set to + * minimize overhead: TIF flags are cleared + * and calls to user_enter/exit are ignored. This + * may be further optimized using static keys. + */ + bool active; + enum { + IN_KERNEL = 0, + IN_USER, + } state; +}; + +static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +#ifdef CONFIG_CONTEXT_TRACKING_FORCE + .active = true, +#endif +}; + +void user_enter(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. 
+ */ + if (in_interrupt()) + return; + + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.active) && + __this_cpu_read(context_tracking.state) != IN_USER) { + __this_cpu_write(context_tracking.state, IN_USER); + rcu_user_enter(); + } + local_irq_restore(flags); +} + +void user_exit(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.state) == IN_USER) { + __this_cpu_write(context_tracking.state, IN_KERNEL); + rcu_user_exit(); + } + local_irq_restore(flags); +} + +void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) +{ + if (__this_cpu_read(context_tracking.active)) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7733eb56e156..e441b77b614e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -207,9 +207,6 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), -#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) - .ignore_user_qs = true, -#endif }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -420,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - - WARN_ON_ONCE(!current->mm); - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->ignore_user_qs && !rdtp->in_user) { - rdtp->in_user = true; - rcu_eqs_enter(true); - } - local_irq_restore(flags); + rcu_eqs_enter(1); } /** @@ -579,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_user_exit(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. 
- */ - if (in_interrupt()) - return; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->in_user) { - rdtp->in_user = false; - rcu_eqs_exit(true); - } - local_irq_restore(flags); + rcu_eqs_exit(1); } /** @@ -722,21 +677,6 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_RCU_USER_QS -void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) -{ - struct rcu_dynticks *rdtp; - - /* Interrupts are disabled in context switch */ - rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->ignore_user_qs) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); - } -} -#endif /* #ifdef CONFIG_RCU_USER_QS */ - #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 36f260864f65..80f80dfca70e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -1886,8 +1887,8 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ - rcu_user_hooks_switch(prev, next); switch_to(prev, next, prev); barrier(); @@ -2911,7 +2912,7 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING asmlinkage void __sched schedule_user(void) { /* @@ -2920,9 +2921,9 @@ asmlinkage void __sched schedule_user(void) * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. */ - rcu_user_exit(); + user_exit(); schedule(); - rcu_user_enter(); + user_enter(); } #endif @@ -3027,7 +3028,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - rcu_user_exit(); + user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); -- cgit v1.2.3 From 879a3d9dbbde823ac77d39131e7a287f31b8296f Mon Sep 17 00:00:00 2001 From: Gao feng Date: Sat, 1 Dec 2012 00:21:28 +0800 Subject: cgroup: use cgroup_addrm_files() in cgroup_clear_directory() cgroup_clear_directory() incorrectly invokes cgroup_rm_file() on each cftset of the target subsystems, which only removes the first file of each set. This leaves dangling files after subsystems are removed from a cgroup root via remount. Use cgroup_addrm_files() to remove all files of target subsystems. tj: Move cgroup_addrm_files() prototype decl upwards next to other global declarations. Commit message updated. 
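The underlying detail: a cftype set is an array terminated by an entry with an empty name, so removal has to walk the whole array rather than act on its first element. Roughly what cgroup_addrm_files() does on the removal path (simplified sketch, not the literal kernel code):

  static void clear_subsys_files(struct cgroup *cgrp, struct cftype cfts[])
  {
          struct cftype *cft;

          /* cfts[] ends with an all-empty terminator entry */
          for (cft = cfts; cft->name[0] != '\0'; cft++)
                  cgroup_rm_file(cgrp, cft);
  }

Passing set->cfts straight to cgroup_rm_file() only ever touched the first element of that array, which is why the remaining files were left dangling.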
Signed-off-by: Gao feng Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b186a7e25b0a..e1293a9189b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -246,6 +246,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); static int need_forkexit_callback __read_mostly; static int cgroup_destroy_locked(struct cgroup *cgrp); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + struct cftype cfts[], bool is_add); #ifdef CONFIG_PROVE_LOCKING int cgroup_lock_is_held(void) @@ -964,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, if (!test_bit(ss->subsys_id, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_rm_file(cgrp, set->cfts); + cgroup_addrm_files(cgrp, NULL, set->cfts, false); } if (base_files) { while (!list_empty(&cgrp->files)) -- cgit v1.2.3 From 412d32e6c98527078779e5b515823b2810e40324 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 28 Nov 2012 07:17:18 +0100 Subject: workqueue: exit rescuer_thread() as TASK_RUNNING A rescue thread exiting TASK_INTERRUPTIBLE can lead to a task scheduling off, never to be seen again. In the case where this occurred, an exiting thread hit reiserfs homebrew conditional resched while holding a mutex, bringing the box to its knees. PID: 18105 TASK: ffff8807fd412180 CPU: 5 COMMAND: "kdmflush" #0 [ffff8808157e7670] schedule at ffffffff8143f489 #1 [ffff8808157e77b8] reiserfs_get_block at ffffffffa038ab2d [reiserfs] #2 [ffff8808157e79a8] __block_write_begin at ffffffff8117fb14 #3 [ffff8808157e7a98] reiserfs_write_begin at ffffffffa0388695 [reiserfs] #4 [ffff8808157e7ad8] generic_perform_write at ffffffff810ee9e2 #5 [ffff8808157e7b58] generic_file_buffered_write at ffffffff810eeb41 #6 [ffff8808157e7ba8] __generic_file_aio_write at ffffffff810f1a3a #7 [ffff8808157e7c58] generic_file_aio_write at ffffffff810f1c88 #8 [ffff8808157e7cc8] do_sync_write at ffffffff8114f850 #9 [ffff8808157e7dd8] do_acct_process at ffffffff810a268f [exception RIP: kernel_thread_helper] RIP: ffffffff8144a5c0 RSP: ffff8808157e7f58 RFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8107af60 RDI: ffff8803ee491d18 RBP: 0000000000000000 R8: 0000000000000000 R9: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 Signed-off-by: Mike Galbraith Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 042d221d33cc..ac25db111f48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2407,8 +2407,10 @@ static int rescuer_thread(void *__wq) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); return 0; + } /* * See whether any cpu is asking for help. Unbounded -- cgit v1.2.3 From 8852aac25e79e38cc6529f20298eed154f60b574 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 Dec 2012 16:23:42 -0800 Subject: workqueue: mod_delayed_work_on() shouldn't queue timer on 0 delay 8376fe22c7 ("workqueue: implement mod_delayed_work[_on]()") implemented mod_delayed_work[_on]() using the improved try_to_grab_pending(). 
The function is later used, among others, to replace [__]cancel_delayed_work() + queue_delayed_work() combinations. Unfortunately, a delayed_work item w/ zero @delay is handled slightly differently by mod_delayed_work_on() compared to queue_delayed_work_on(). The latter skips the timer altogether and directly queues it using queue_work_on() while the former schedules a timer which will expire on the closest tick. This means, when @delay is zero, that [__]cancel_delayed_work() + queue_delayed_work_on() makes the target item immediately executable while mod_delayed_work_on() may induce a delay of up to a full tick. This somewhat subtle difference breaks some of the converted users, e.g. block queue plugging uses delayed_work for deferred processing and uses mod_delayed_work_on() when the queue needs to be immediately unplugged. The above problem manifested as a noticeably higher number of context switches under certain circumstances. The difference in behavior was caused by missing special case handling for 0 delay in mod_delayed_work_on() compared to queue_delayed_work_on(). Joonsoo Kim posted a patch to add it - ("workqueue: optimize mod_delayed_work_on() when @delay == 0")[1]. The patch was queued for 3.8 but it was described as an optimization and I missed that it was a correctness issue. As both queue_delayed_work_on() and mod_delayed_work_on() use __queue_delayed_work() for queueing, it seems that the better approach is to move the 0 delay special handling to the function instead of duplicating it in mod_delayed_work_on(). Fix the problem by moving 0 delay special case handling from queue_delayed_work_on() to __queue_delayed_work(). This replaces Joonsoo's patch. [1] http://thread.gmane.org/gmane.linux.kernel/1379011/focus=1379012 Signed-off-by: Tejun Heo Reported-and-tested-by: Anders Kaseorg Reported-and-tested-by: Zlatko Calusic LKML-Reference: LKML-Reference: <50A78AA9.5040904@iskon.hr> Cc: Joonsoo Kim --- kernel/workqueue.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ac25db111f48..084aa47bac82 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1364,6 +1364,17 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + __queue_work(cpu, wq, &dwork->work); + return; + } + timer_stats_timer_set_start_info(&dwork->timer); /* @@ -1417,9 +1428,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, bool ret = false; unsigned long flags; - if (!delay) - return queue_work_on(cpu, wq, &dwork->work); - /* read the comment in __queue_work() */ local_irq_save(flags); -- cgit v1.2.3 From 999767beb1b4a10eabf90e6017e496536cf4db0b Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Sun, 21 Oct 2012 01:30:06 +0900 Subject: workqueue: trivial fix for return statement in work_busy() Return type of work_busy() is unsigned int. There is a return statement returning the boolean value 'false' in work_busy(). It is not a problem, because 'false' may be treated as '0'. However, fixing it would make the code robust.
Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 084aa47bac82..26f5d16aef65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3485,7 +3485,7 @@ unsigned int work_busy(struct work_struct *work) unsigned int ret = 0; if (!gcwq) - return false; + return 0; spin_lock_irqsave(&gcwq->lock, flags); -- cgit v1.2.3 From 3657600040a7279e52252af3f9d7e253f4f49ef0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 26 Oct 2012 23:03:49 +0900 Subject: workqueue: add WARN_ON_ONCE() on CPU number to wq_worker_waking_up() Recently, workqueue code has gone through some changes and we found some bugs related to concurrency management operations happening on the wrong CPU. When a worker is concurrency managed (!WORKER_NOT_RUNNIG), it should be bound to its associated cpu and woken up to that cpu. Add WARN_ON_ONCE() to verify this. Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 26f5d16aef65..ae9a05603e01 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -739,8 +739,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) { + WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); atomic_inc(get_pool_nr_running(worker->pool)); + } } /** -- cgit v1.2.3 From 84ecfd15f5547c992c901df6ec14b4d507eb2c6e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 23 Nov 2012 12:08:16 +0000 Subject: modsign: add symbol prefix to certificate list Add the arch symbol prefix (if applicable) to the asm definition of modsign_certificate_list and modsign_certificate_list_end. This uses the recently defined SYMBOL_PREFIX which is derived from CONFIG_SYMBOL_PREFIX. This fixes the build of module signing on the blackfin and metag architectures. Signed-off-by: James Hogan Cc: Rusty Russell Cc: David Howells Cc: Mike Frysinger Signed-off-by: Rusty Russell --- kernel/modsign_pubkey.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 4646eb2c3820..767e559dfb10 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -21,10 +21,10 @@ struct key *modsign_keyring; extern __initdata const u8 modsign_certificate_list[]; extern __initdata const u8 modsign_certificate_list_end[]; asm(".section .init.data,\"aw\"\n" - "modsign_certificate_list:\n" + SYMBOL_PREFIX "modsign_certificate_list:\n" ".incbin \"signing_key.x509\"\n" ".incbin \"extra_certificates\"\n" - "modsign_certificate_list_end:" + SYMBOL_PREFIX "modsign_certificate_list_end:" ); /* -- cgit v1.2.3 From 7083d0378a1746f2b45729cae494c6b92e75d73f Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 3 Dec 2012 09:28:18 +0800 Subject: cgroup: remove subsystem files when remounting cgroup cgroup_clear_directroy is called by cgroup_d_remove_dir and cgroup_remount. when we call cgroup_remount to remount the cgroup,the subsystem may be unlinked from cgroupfs_root->subsys_list in rebind_subsystem,this subsystem's files will not be removed in cgroup_clear_directroy. And the system will panic when we try to access these files. 
this patch removes subsystems's files before rebind_subsystems, if rebind_subsystems failed, repopulate these removed files. With help from Tejun. Signed-off-by: Gao feng Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e1293a9189b6..5cc37241a6fb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1349,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* + * Clear out the files of subsystems that should be removed, do + * this before rebind_subsystems, since rebind_subsystems may + * change this hierarchy's subsys_list. + */ + cgroup_clear_directory(cgrp->dentry, false, removed_mask); + ret = rebind_subsystems(root, opts.subsys_mask); if (ret) { + /* rebind_subsystems failed, re-populate the removed files */ + cgroup_populate_dir(cgrp, false, removed_mask); drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); /* re-populate subsystem files */ cgroup_populate_dir(cgrp, false, added_mask); -- cgit v1.2.3 From fd8ef11730f1d03d5d6555aa53126e9e34f52f12 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 3 Dec 2012 06:25:25 +0100 Subject: Revert "sched, autogroup: Stop going ahead if autogroup is disabled" This reverts commit 800d4d30c8f20bd728e5741a3b77c4859a613f7c. Between commits 8323f26ce342 ("sched: Fix race in task_group()") and 800d4d30c8f2 ("sched, autogroup: Stop going ahead if autogroup is disabled"), autogroup is a wreck. With both applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to commit 800d4d30c8f2 not allowing autogroup to move things, and commit 8323f26ce342 making that the only way to switch runqueues: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] effective_load.isra.43+0x50/0x90 Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 MEDIONPC MS-7502/MS-7502 RIP: effective_load.isra.43+0x50/0x90 Process systemd-user-se (pid: 7047, threadinfo ffff880221dde000, task ffff88022618b3a0) Call Trace: select_task_rq_fair+0x255/0x780 try_to_wake_up+0x156/0x2c0 wake_up_state+0xb/0x10 signal_wake_up+0x28/0x40 complete_signal+0x1d6/0x250 __send_signal+0x170/0x310 send_signal+0x40/0x80 do_send_sig_info+0x47/0x90 group_send_sig_info+0x4a/0x70 kill_pid_info+0x3a/0x60 sys_kill+0x97/0x1a0 ? vfs_read+0x120/0x160 ? 
sys_read+0x45/0x90 system_call_fastpath+0x16/0x1b Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c RIP [] effective_load.isra.43+0x50/0x90 RSP CR2: 0000000000000000 Signed-off-by: Mike Galbraith Acked-by: Ingo Molnar Cc: Yong Zhang Cc: Peter Zijlstra Cc: stable@vger.kernel.org # 2.6.39+ Signed-off-by: Linus Torvalds --- kernel/sched/auto_group.c | 4 ---- kernel/sched/auto_group.h | 5 ----- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..15f60d01198b 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -143,15 +143,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - t = p; do { sched_move_task(t); } while_each_thread(p, t); -out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..443232ebbb53 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,11 +4,6 @@ #include struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. - */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; -- cgit v1.2.3 From fc4b514f2727f74a4587c31db87e0e93465518c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Dec 2012 07:40:39 -0800 Subject: workqueue: convert BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s 8852aac25e ("workqueue: mod_delayed_work_on() shouldn't queue timer on 0 delay") unexpectedly uncovered a very nasty abuse of delayed_work in megaraid - it allocated work_struct, casted it to delayed_work and then pass that into queue_delayed_work(). Previously, this was okay because 0 @delay short-circuited to queue_work() before doing anything with delayed_work. 8852aac25e moved 0 @delay test into __queue_delayed_work() after sanity check on delayed_work making megaraid trigger BUG_ON(). Although megaraid is already fixed by c1d390d8e6 ("megaraid: fix BUG_ON() from incorrect use of delayed work"), this patch converts BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s so that such abusers, if there are more, trigger warning but don't crash the machine. Signed-off-by: Tejun Heo Cc: Xiaotian Feng --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 084aa47bac82..1dae900df798 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1361,8 +1361,8 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + WARN_ON_ONCE(timer_pending(timer)); + WARN_ON_ONCE(!list_empty(&work->entry)); /* * If @delay is 0, queue @dwork->work immediately. This is for -- cgit v1.2.3 From 8d4516904b39507458bee8115793528e12b1d8dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Dec 2012 18:59:34 +0100 Subject: watchdog: Fix CPU hotplug regression Norbert reported: "3.7-rc6 booted with nmi_watchdog=0 fails to suspend to RAM or offline CPUs. 
It's reproducible with a KVM guest and physical system." The reason is that commit bcd951cf ("watchdog: Use hotplug thread infrastructure") failed to take this into account. So the CPU offline code gets stuck in the teardown function because it accesses uninitialized data structures. Add a check for watchdog_enabled into that path to cure the issue. Reported-and-tested-by: Norbert Warmuth Tested-by: Joseph Salisbury Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1211231033230.2701@ionos Link: http://bugs.launchpad.net/bugs/1079534 Signed-off-by: Thomas Gleixner --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dd4b80a9f1a9..c8c21be11ab4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -368,6 +368,9 @@ static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + if (!watchdog_enabled) + return; + watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ -- cgit v1.2.3 From 12e130b04580532aa099893158aa2776b321ae7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Oct 2012 15:05:48 +0100 Subject: MODSIGN: Don't use enum-type bitfields in module signature info block Don't use enum-type bitfields in the module signature info block as we can't be certain how the compiler will handle them. As I understand it, it is arch-dependent, and it is possible for the compiler to rearrange them based on endianness and to insert a byte of padding to pad the three enums out to four bytes. Instead use u8 fields for these, which the compiler should emit in the right order without padding. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- kernel/module_signing.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index ea1b1df5dbb0..f2970bddc5ea 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -27,13 +27,13 @@ * - Information block */ struct module_signature { - enum pkey_algo algo : 8; /* Public-key crypto algorithm */ - enum pkey_hash_algo hash : 8; /* Digest algorithm */ - enum pkey_id_type id_type : 8; /* Key identifier type */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ - u8 __pad[3]; - __be32 sig_len; /* Length of signature data */ + u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ + u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 id_type; /* Key identifier type [enum pkey_id_type] */ + u8 signer_len; /* Length of signer's name */ + u8 key_id_len; /* Length of key identifier */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ }; /* -- cgit v1.2.3 From f0fcf2002bf122afe8fe1b74b2cee3710c7e6cd9 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Thu, 6 Dec 2012 17:16:23 +0800 Subject: padata: use __this_cpu_read per-cpu helper For bottom halves off, __this_cpu_read is better.
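To make that one-line rationale concrete: with bottom halves disabled the task cannot migrate to another CPU, so the local per-CPU value can be read with a single __this_cpu_read() instead of computing this CPU's pointer by hand. A minimal kernel-style sketch with hypothetical names (the patch below applies the same transformation to pd->pqueue->cpu_index):

#include <linux/percpu.h>
#include <linux/smp.h>

struct example_pcpu {
        int cpu_index;
};

/* before: build the local CPU's pointer explicitly, then dereference it */
static int read_cpu_index_old(struct example_pcpu __percpu *q)
{
        return per_cpu_ptr(q, smp_processor_id())->cpu_index;
}

/* after: single this-CPU load, valid while BHs/preemption keep us on this CPU */
static int read_cpu_index_new(struct example_pcpu __percpu *q)
{
        return __this_cpu_read(q->cpu_index);
}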
Signed-off-by: Shan Wei Reviewed-by: Christoph Lameter Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 89fe3d1b9efb..072f4ee4eb89 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; unsigned int next_nr, next_index; - struct padata_parallel_queue *queue, *next_queue; + struct padata_parallel_queue *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) goto out; } - queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); - if (queue->cpu_index == next_queue->cpu_index) { + if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; } -- cgit v1.2.3 From f33fddc2b9573d8359f1007d4bbe5cd587a0c093 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 6 Dec 2012 14:38:57 +0800 Subject: cgroup_rm_file: don't delete the uncreated files in cgroup_add_file,when creating files for cgroup, some of creation may be skipped. So we need to avoid deleting these uncreated files in cgroup_rm_file, otherwise the warning msg will be triggered. "cgroup_addrm_files: failed to remove memory_pressure_enabled, err=-2" Signed-off-by: Gao feng Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cc37241a6fb..f34c41bfaa37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2724,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, simple_xattrs_init(&cft->xattrs); - /* does @cft->flags tell us to skip creation on @cgrp? */ - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) - return 0; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) - return 0; - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); strcat(name, "."); @@ -2770,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* does cft->flags tell us to skip this file on @cgrp? 
*/ + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + continue; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + continue; + if (is_add) err = cgroup_add_file(cgrp, subsys, cft); else -- cgit v1.2.3 From c1ad41f1f7270c1956da13fa8fd59d8d5929d56e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Dec 2012 10:23:45 +0100 Subject: Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled" This reverts commit 5258f386ea4e8454bc801fb443e8a4217da1947c, because the underlying autogroups bug got fixed upstream in a better way, via: fd8ef11730f1 Revert "sched, autogroup: Stop going ahead if autogroup is disabled" Cc: Mike Galbraith Cc: Yong Zhang Cc: Peter Zijlstra Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 68 +++++++++++++++++++++++++++++++++++++++-------- kernel/sched/auto_group.h | 9 ++++++- kernel/sysctl.c | 6 +++-- 3 files changed, 69 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0f1bacb005a4..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -110,9 +110,6 @@ out_fail: bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { - if (!sysctl_sched_autogroup_enabled) - return false; - if (tg != &root_task_group) return false; @@ -146,11 +143,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -158,11 +159,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) /* Allocates GFP_KERNEL, cannot be called under any spinlock */ void sched_autogroup_create_attach(struct task_struct *p) { - struct autogroup *ag; + struct autogroup *ag = autogroup_create(); - if (!sysctl_sched_autogroup_enabled) - return; - ag = autogroup_create(); autogroup_move_group(p, ag); /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); @@ -178,15 +176,11 @@ EXPORT_SYMBOL(sched_autogroup_detach); void sched_autogroup_fork(struct signal_struct *sig) { - if (!sysctl_sched_autogroup_enabled) - return; sig->autogroup = autogroup_task_get(current); } void sched_autogroup_exit(struct signal_struct *sig) { - if (!sysctl_sched_autogroup_enabled) - return; autogroup_kref_put(sig->autogroup); } @@ -199,6 +193,58 @@ static int __init setup_autogroup(char *str) __setup("noautogroup", setup_autogroup); +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (nice < -20 || nice > 19) + return -EINVAL; + + err = security_task_setnice(current, nice); + if (err) + return err; + + if (nice < 0 && !can_nice(current, nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. 
*/ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + if (!err) + ag->nice = nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + if (!task_group_is_autogroup(ag->tg)) + goto out; + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + +out: + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + #ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 4552c6bf79d2..8bd047142816 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,6 +4,11 @@ #include struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; @@ -24,7 +29,9 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - if (task_wants_autogroup(p, tg)) + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; return tg; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b0fa5ad09873..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,8 +367,10 @@ static struct ctl_table kern_table[] = { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), - .mode = 0444, - .proc_handler = proc_dointvec, + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_CFS_BANDWIDTH -- cgit v1.2.3 From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. 
Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- kernel/sched/core.c | 13 +++++ kernel/sched/fair.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 7 +++ kernel/sched/sched.h | 6 +++ kernel/sysctl.c | 24 +++++++++- 5 files changed, 173 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda712..cad0d092ce3b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..6831abb5dbef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include @@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms: 5s + */ +unsigned int sysctl_numa_balancing_scan_period_min = 5000; +unsigned int sysctl_numa_balancing_scan_period_max = 5000*16; + +static void task_numa_placement(struct task_struct *p) +{ + int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages) +{ + struct task_struct *p = current; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + task_numa_placement(p); +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + ACCESS_ONCE(mm->numa_scan_seq)++; + { + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + change_prot_numa(vma, vma->vm_start, vma->vm_end); + } + up_read(&mm->mmap_sem); + } +} + +/* + * Drive the periodic memory faults.. 
+ */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..5fb7aefbec80 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, true) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfabc..ae31c051ff2f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#else +#define sched_feat_numa(x) (0) +#endif + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f9..025e1ae50ef1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = 
proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, -- cgit v1.2.3 From 6e5fb223e89dbe5cb5c563f8d4a4a0a7d62455a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:45 +0200 Subject: mm: sched: numa: Implement constant, per task Working Set Sampling (WSS) rate Previously, to probe the working set of a task, we'd use a very simple and crude method: mark all of its address space PROT_NONE. That method has various (obvious) disadvantages: - it samples the working set at dissimilar rates, giving some tasks a sampling quality advantage over others. - creates performance problems for tasks with very large working sets - over-samples processes with large address spaces but which only very rarely execute Improve that method by keeping a rotating offset into the address space that marks the current position of the scan, and advance it by a constant rate (in a CPU cycles execution proportional manner). If the offset reaches the last mapped address of the mm then it then it starts over at the first address. The per-task nature of the working set sampling functionality in this tree allows such constant rate, per task, execution-weight proportional sampling of the working set, with an adaptive sampling interval/frequency that goes from once per 100ms up to just once per 8 seconds. The current sampling volume is 256 MB per interval. As tasks mature and converge their working set, so does the sampling rate slow down to just a trickle, 256 MB per 8 seconds of CPU time executed. This, beyond being adaptive, also rate-limits rarely executing systems and does not over-sample on overloaded systems. [ In AutoNUMA speak, this patch deals with the effective sampling rate of the 'hinting page fault'. AutoNUMA's scanning is currently rate-limited, but it is also fundamentally single-threaded, executing in the knuma_scand kernel thread, so the limit in AutoNUMA is global and does not scale up with the number of CPUs, nor does it scan tasks in an execution proportional manner. So the idea of rate-limiting the scanning was first implemented in the AutoNUMA tree via a global rate limit. This patch goes beyond that by implementing an execution rate proportional working set sampling rate that is not implemented via a single global scanning daemon. ] [ Dan Carpenter pointed out a possible NULL pointer dereference in the first version of this patch. ] Based-on-idea-by: Andrea Arcangeli Bug-Found-By: Dan Carpenter Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote changelog and fixed bug. 
] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- kernel/sched/fair.c | 65 ++++++++++++++++++++++++++++++++++++++++++----------- kernel/sysctl.c | 7 ++++++ 2 files changed, 59 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6831abb5dbef..0a349dd1fa60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -780,10 +780,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * numa task sample period in ms: 5s + * numa task sample period in ms */ -unsigned int sysctl_numa_balancing_scan_period_min = 5000; -unsigned int sysctl_numa_balancing_scan_period_max = 5000*16; +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*16; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; static void task_numa_placement(struct task_struct *p) { @@ -808,6 +811,12 @@ void task_numa_fault(int node, int pages) task_numa_placement(p); } +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -817,6 +826,9 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long offset, end; + long length; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -846,18 +858,45 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; - ACCESS_ONCE(mm->numa_scan_seq)++; - { - struct vm_area_struct *vma; + offset = mm->numa_scan_offset; + length = sysctl_numa_balancing_scan_size; + length <<= 20; - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) - continue; - change_prot_numa(vma, vma->vm_start, vma->vm_end); - } - up_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); + vma = find_vma(mm, offset); + if (!vma) { + reset_ptenuma_scan(p); + offset = 0; + vma = mm->mmap; + } + for (; vma && length > 0; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. They are not likely to be of relevance */ + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) + continue; + + offset = max(offset, vma->vm_start); + end = min(ALIGN(offset + length, HPAGE_SIZE), vma->vm_end); + length -= end - offset; + + change_prot_numa(vma, offset, end); + + offset = end; } + + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. 
+ */ + if (vma) + mm->numa_scan_offset = offset; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 025e1ae50ef1..7d3a2e0475e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3 From 9f40604cdab935e80db57b309c48659de349d4e6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 14 Nov 2012 18:34:32 +0000 Subject: sched, numa, mm: Count WS scanning against present PTEs, not virtual memory ranges By accounting against the present PTEs, scanning speed reflects the actual present (mapped) memory. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- kernel/sched/fair.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0a349dd1fa60..f6e1f25ed2bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -827,8 +827,8 @@ void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; struct vm_area_struct *vma; - unsigned long offset, end; - long length; + unsigned long start, end; + long pages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -858,18 +858,20 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; - offset = mm->numa_scan_offset; - length = sysctl_numa_balancing_scan_size; - length <<= 20; + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + if (!pages) + return; down_read(&mm->mmap_sem); - vma = find_vma(mm, offset); + vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); - offset = 0; + start = 0; vma = mm->mmap; } - for (; vma && length > 0; vma = vma->vm_next) { + for (; vma; vma = vma->vm_next) { if (!vma_migratable(vma)) continue; @@ -877,15 +879,19 @@ void task_numa_work(struct callback_head *work) if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) continue; - offset = max(offset, vma->vm_start); - end = min(ALIGN(offset + length, HPAGE_SIZE), vma->vm_end); - length -= end - offset; - - change_prot_numa(vma, offset, end); + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + pages -= change_prot_numa(vma, start, end); - offset = end; + start = end; + if (pages <= 0) + goto out; + } while (end != vma->vm_end); } +out: /* * It is possible to reach the end of the VMA list but the last few VMAs are * not guaranteed to the vma_migratable. If they are not, we would find the @@ -893,7 +899,7 @@ void task_numa_work(struct callback_head *work) * so check it now. 
*/ if (vma) - mm->numa_scan_offset = offset; + mm->numa_scan_offset = start; else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); -- cgit v1.2.3 From 4b96a29ba891dd59734cb7be80a900fe93aa2d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:47 +0200 Subject: mm: sched: numa: Implement slow start for working set sampling Add a 1 second delay before starting to scan the working set of a task and starting to balance it amongst nodes. [ note that before the constant per task WSS sampling rate patch the initial scan would happen much later still, in effect that patch caused this regression. ] The theory is that short-run tasks benefit very little from NUMA placement: they come and go, and they better stick to the node they were started on. As tasks mature and rebalance to other CPUs and nodes, so does their NUMA placement have to change and so does it start to matter more and more. In practice this change fixes an observable kbuild regression: # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ] !NUMA: 45.291088843 seconds time elapsed ( +- 0.40% ) 45.154231752 seconds time elapsed ( +- 0.36% ) +NUMA, no slow start: 46.172308123 seconds time elapsed ( +- 0.30% ) 46.343168745 seconds time elapsed ( +- 0.25% ) +NUMA, 1 sec slow start: 45.224189155 seconds time elapsed ( +- 0.25% ) 45.160866532 seconds time elapsed ( +- 0.17% ) and it also fixes an observable perf bench (hackbench) regression: # perf stat --null --repeat 10 perf bench sched messaging -NUMA: -NUMA: 0.246225691 seconds time elapsed ( +- 1.31% ) +NUMA no slow start: 0.252620063 seconds time elapsed ( +- 1.13% ) +NUMA 1sec delay: 0.248076230 seconds time elapsed ( +- 1.35% ) The implementation is simple and straightforward, most of the patch deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms tunable knob. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote the changelog, ran measurements, tuned the default. ] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 5 +++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cad0d092ce3b..fbfc4843063f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1543,7 +1543,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_migrate_seq = p->mm ? 
p->mm->numa_scan_seq - 1 : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; #endif /* CONFIG_NUMA_BALANCING */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6e1f25ed2bd..7727b0161579 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -788,6 +788,9 @@ unsigned int sysctl_numa_balancing_scan_period_max = 100*16; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + static void task_numa_placement(struct task_struct *p) { int seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -929,6 +932,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; curr->node_stamp = now; if (!time_before(jiffies, curr->mm->numa_next_scan)) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d3a2e0475e5..48a68cc258c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -352,6 +352,13 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_min_ms", .data = &sysctl_numa_balancing_scan_period_min, -- cgit v1.2.3 From e14808b49f55e0e1135da5e4a154a540dd9f3662 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 10:59:15 +0000 Subject: mm: numa: Rate limit setting of pte_numa if node is saturated If there are a large number of NUMA hinting faults and all of them are resulting in migrations it may indicate that memory is just bouncing uselessly around. NUMA balancing cost is likely exceeding any benefit from locality. Rate limit the PTE updates if the node is migration rate-limited. As noted in the comments, this distorts the NUMA faulting statistics. Signed-off-by: Mel Gorman --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7727b0161579..37e895a941ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -861,6 +862,14 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; + /* + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work + */ + if (migrate_ratelimited(numa_node_id())) + return; + start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ -- cgit v1.2.3 From fb003b80daa0dead5b87f4e2e4fb8da68b110ff2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 15 Nov 2012 09:01:14 +0000 Subject: sched: numa: Slowly increase the scanning period as NUMA faults are handled Currently the rate of scanning for an address space is controlled by the individual tasks. The next scan is simply determined by 2*p->numa_scan_period. The 2*p->numa_scan_period is arbitrary and never changes. 
At this point there is still no proper policy that decides if a task or process is properly placed. It just scans and assumes the next NUMA fault will place it properly. As it is assumed that pages will get properly placed over time, increase the scan window each time a fault is incurred. This is a big assumption as noted in the comments. It should be noted that changing to p->numa_scan_period will increase system CPU usage because now the scanning rate has effectively doubled. If that is a problem then the min_rate should be made 200ms instead of restoring the 2* logic. Signed-off-by: Mel Gorman --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 37e895a941ab..dd18087fd369 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -812,6 +812,15 @@ void task_numa_fault(int node, int pages) /* FIXME: Allocate task-specific structure for placement policy here */ + /* + * Assume that as faults occur that pages are getting properly placed + * and fewer NUMA hints are required. Note that this is a big + * assumption, it assumes processes reach a steady steady with no + * further phase changes. + */ + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(2)); + task_numa_placement(p); } @@ -858,7 +867,7 @@ void task_numa_work(struct callback_head *work) if (p->numa_scan_period == 0) p->numa_scan_period = sysctl_numa_balancing_scan_period_min; - next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period); + next_scan = now + msecs_to_jiffies(p->numa_scan_period); if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; -- cgit v1.2.3 From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. 
Signed-off-by: Mel Gorman --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 29 +++++++++++++++++++++-------- kernel/sysctl.c | 7 +++++++ 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbfc4843063f..9d255bc0e278 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1537,6 +1537,7 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; p->mm->numa_scan_seq = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd18087fd369..4b577863933f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * numa task sample period in ms */ unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*16; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages) +void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; /* FIXME: Allocate task-specific structure for placement policy here */ /* - * Assume that as faults occur that pages are getting properly placed - * and fewer NUMA hints are required. Note that this is a big - * assumption, it assumes processes reach a steady steady with no - * further phase changes. + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes */ - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(2)); + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); } @@ -857,6 +857,19 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + /* * Enforce maximal scan/migration frequency.. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 48a68cc258c1..8906f90d6fa2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, -- cgit v1.2.3 From 1a687c2e9a99335c9e77392f050fe607fa18a652 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 11:16:36 +0000 Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing This patch adds Kconfig options and kernel parameters to allow the enabling and disabling of automatic NUMA balancing. The existance of such a switch was and is very important when debugging problems related to transparent hugepages and we should have the same for automatic NUMA placement. Signed-off-by: Mel Gorman --- kernel/sched/core.c | 48 +++++++++++++++++++++++++++++++++--------------- kernel/sched/fair.c | 3 +++ kernel/sched/features.h | 6 ++++-- 3 files changed, 40 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d255bc0e278..7a45015274ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -192,23 +192,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -228,6 +215,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1549,6 +1557,16 @@ static void __sched_fork(struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +#ifdef CONFIG_NUMA_BALANCING +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b577863933f..7a02a2082e95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -811,6 +811,9 @@ void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; + if (!sched_feat_numa(NUMA)) + return; + /* FIXME: Allocate task-specific structure for placement policy here */ /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5fb7aefbec80..d2373a3e3252 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,8 +63,10 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) /* - * Apply the automatic NUMA scheduling policy + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. 
Can be controlled via + * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, true) +SCHED_FEAT(NUMA, false) #endif -- cgit v1.2.3 From 3105b86a9fee7d2c2e76edb53bbbc4027599628f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 23 Nov 2012 11:23:49 +0000 Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG The "mm: sched: numa: Control enabling and disabling of NUMA balancing" depends on scheduling debug being enabled but it's perfectly legimate to disable automatic NUMA balancing even without this option. This should take care of it. Signed-off-by: Mel Gorman --- kernel/sched/core.c | 9 +++++++++ kernel/sched/sched.h | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a45015274ab..2a46f4407414 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1558,6 +1558,7 @@ static void __sched_fork(struct task_struct *p) } #ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG void set_numabalancing_state(bool enabled) { if (enabled) @@ -1565,6 +1566,14 @@ void set_numabalancing_state(bool enabled) else sched_feat_set("NO_NUMA"); } +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ + numabalancing_enabled = enabled; +} +#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_NUMA_BALANCING */ /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae31c051ff2f..4f93d031875a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -650,9 +650,15 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #ifdef CONFIG_NUMA_BALANCING #define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ #else #define sched_feat_numa(x) (0) -#endif +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ static inline u64 global_rt_period(void) { -- cgit v1.2.3 From 5bca23035391928c4c7301835accca3551b96cc2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 14:40:03 +0000 Subject: mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node Due to the fact that migrations are driven by the CPU a task is running on there is no point tracking NUMA faults until one task runs on a new node. This patch tracks the first node used by an address space. Until it changes, PTE scanning is disabled and no NUMA hinting faults are trapped. This should help workloads that are short-lived, do not care about NUMA placement or have bound themselves to a single node. This takes advantage of the logic in "mm: sched: numa: Implement slow start for working set sampling" to delay when the checks are made. This will take advantage of processes that set their CPU and node bindings early in their lifetime. It will also potentially allow any initial load balancing to take place. 
Signed-off-by: Mel Gorman --- kernel/fork.c | 3 +++ kernel/sched/fair.c | 18 ++++++++++++++++++ kernel/sched/features.h | 4 +++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..296ea308096d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -820,6 +820,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; +#endif +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a02a2082e95..3e18f611a5aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -860,6 +860,24 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + /* * Reset the scan period if enough time has gone by. Objective is that * scanning will be reduced if pages are properly placed. As tasks diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d2373a3e3252..e7c25fff1e94 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -65,8 +65,10 @@ SCHED_FEAT(LB_MIN, false) /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) #endif -- cgit v1.2.3 From e2a29943e9a2ee2aa737a77f550f46ba72269db4 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Tue, 14 Jun 2011 17:29:51 +0200 Subject: fsnotify: pass group to fsnotify_destroy_mark() In fsnotify_destroy_mark() dont get the group from the passed mark anymore, but pass the group itself as an additional parameter to the function. 
Signed-off-by: Lino Sanfilippo Signed-off-by: Eric Paris --- kernel/audit_tree.c | 10 +++++----- kernel/audit_watch.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ed206fd88cca..e81175ef25f8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -249,7 +249,7 @@ static void untag_chunk(struct node *p) list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); + fsnotify_destroy_mark(entry, audit_tree_group); goto out; } @@ -291,7 +291,7 @@ static void untag_chunk(struct node *p) owner->root = new; spin_unlock(&hash_lock); spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); + fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; @@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); - fsnotify_destroy_mark(entry); + fsnotify_destroy_mark(entry, audit_tree_group); fsnotify_put_mark(entry); return 0; } @@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); - fsnotify_destroy_mark(chunk_entry); + fsnotify_destroy_mark(chunk_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -443,7 +443,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); - fsnotify_destroy_mark(old_entry); + fsnotify_destroy_mark(old_entry, audit_tree_group); fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3823281401b5..a66affc1c12c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } mutex_unlock(&audit_filter_mutex); - fsnotify_destroy_mark(&parent->mark); + fsnotify_destroy_mark(&parent->mark, audit_watch_group); } /* Get path information necessary for adding watches. */ @@ -456,7 +456,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) if (list_empty(&parent->watches)) { audit_get_parent(parent); - fsnotify_destroy_mark(&parent->mark); + fsnotify_destroy_mark(&parent->mark, audit_watch_group); audit_put_parent(parent); } } -- cgit v1.2.3 From 38d7bee9d24adf4c95676a3dc902827c72930ebb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:24 -0800 Subject: cpuset: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Acked-by: Hillf Danton Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b017887d632f..7bb63eea6eb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs, * are online, with memory. 
If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some * online mems. If we get all the way to the top and still haven't - * found any online mems, return node_states[N_HIGH_MEMORY]. + * found any online mems, return node_states[N_MEMORY]. * * One way or another, we guarantee to return some non-empty subset - * of node_states[N_HIGH_MEMORY]. + * of node_states[N_MEMORY]. * * Call with callback_mutex held. */ @@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs, static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_HIGH_MEMORY])) + node_states[N_MEMORY])) cs = cs->parent; if (cs) nodes_and(*pmask, cs->mems_allowed, - node_states[N_HIGH_MEMORY]); + node_states[N_MEMORY]); else - *pmask = node_states[N_HIGH_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); + *pmask = node_states[N_MEMORY]; + BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); } /* @@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, return -ENOMEM; /* - * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; + * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; * it's read-only */ if (cs == &top_cpuset) { @@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) { + node_states[N_MEMORY])) { retval = -EINVAL; goto done; } @@ -2026,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue) * before dropping down to the next. It always processes a node before * any of its children. * - * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * In the case of memory hot-unplug, it will remove nodes from N_MEMORY * if all present pages from a node are offlined. */ static void @@ -2065,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) /* Continue past cpusets with all mems online */ if (nodes_subset(cp->mems_allowed, - node_states[N_HIGH_MEMORY])) + node_states[N_MEMORY])) continue; oldmems = cp->mems_allowed; @@ -2073,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) /* Remove offline mems from this cpuset. */ mutex_lock(&callback_mutex); nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_HIGH_MEMORY]); + node_states[N_MEMORY]); mutex_unlock(&callback_mutex); /* Move tasks from the empty cpuset to a parent */ @@ -2126,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online) #ifdef CONFIG_MEMORY_HOTPLUG /* - * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. + * Call this routine anytime after node_states[N_MEMORY] changes. * See cpuset_update_active_cpus() for CPU hotplug handling. 
*/ static int cpuset_track_online_nodes(struct notifier_block *self, @@ -2140,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_ONLINE: oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + top_cpuset.mems_allowed = node_states[N_MEMORY]; mutex_unlock(&callback_mutex); update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; @@ -2169,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + top_cpuset.mems_allowed = node_states[N_MEMORY]; hotplug_memory_notifier(cpuset_track_online_nodes, 10); @@ -2237,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void) * * Description: Returns the nodemask_t mems_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of node_states[N_HIGH_MEMORY], even if this means going outside the + * subset of node_states[N_MEMORY], even if this means going outside the * tasks cpuset. **/ -- cgit v1.2.3 From aee4faa499dc58fdb126885f68d447f2eba79b43 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:39 -0800 Subject: kthread: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 29fb60caecb5..691dc2ef9baf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -428,7 +428,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_states[N_HIGH_MEMORY]); + set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; -- cgit v1.2.3 From 44e33e8f95171b93968c3ac3cf8516998b1db65d Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 12 Dec 2012 13:51:52 -0800 Subject: res_counter: delete res_counter_write() Since commit 628f42355389 ("memcg: limit change shrink usage") both res_counter_write() and write_strategy_fn have been unused. This patch deletes them both. 
Signed-off-by: Greg Thelen Cc: Glauber Costa Cc: Tejun Heo Acked-by: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ad581aa2369a..3920d593e63c 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -192,25 +192,3 @@ int res_counter_memparse_write_strategy(const char *buf, *res = PAGE_ALIGN(*res); return 0; } - -int res_counter_write(struct res_counter *counter, int member, - const char *buf, write_strategy_fn write_strategy) -{ - char *end; - unsigned long flags; - unsigned long long tmp, *val; - - if (write_strategy) { - if (write_strategy(buf, &tmp)) - return -EINVAL; - } else { - tmp = simple_strtoull(buf, &end, 10); - if (*end != '\0') - return -EINVAL; - } - spin_lock_irqsave(&counter->lock, flags); - val = res_counter_member(counter, member); - *val = tmp; - spin_unlock_irqrestore(&counter->lock, flags); - return 0; -} -- cgit v1.2.3 From 34e1169d996ab148490c01b65b4ee371cf8ffba2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Oct 2012 07:31:07 +1030 Subject: module: add syscall to load module from fd As part of the effort to create a stronger boundary between root and kernel, Chrome OS wants to be able to enforce that kernel modules are being loaded only from our read-only crypto-hash verified (dm_verity) root filesystem. Since the init_module syscall hands the kernel a module as a memory blob, no reasoning about the origin of the blob can be made. Earlier proposals for appending signatures to kernel modules would not be useful in Chrome OS, since it would involve adding an additional set of keys to our kernel and builds for no good reason: we already trust the contents of our root filesystem. We don't need to verify those kernel modules a second time. Having to do signature checking on module loading would slow us down and be redundant. All we need to know is where a module is coming from so we can say yes/no to loading it. If a file descriptor is used as the source of a kernel module, many more things can be reasoned about. In Chrome OS's case, we could enforce that the module lives on the filesystem we expect it to live on. In the case of IMA (or other LSMs), it would be possible, for example, to examine extended attributes that may contain signatures over the contents of the module. This introduces a new syscall (on x86), similar to init_module, that has only two arguments. The first argument is used as a file descriptor to the module and the second argument is a pointer to the NULL terminated string of module arguments. 
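A minimal userspace sketch of the new call, assuming __NR_finit_module is provided by the installed kernel headers ("example.ko" is a placeholder path; at this point in the series the syscall takes exactly the two arguments described above, a flags argument is only added by a later patch):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	int main(void)
	{
		int fd = open("example.ko", O_RDONLY);

		if (fd < 0)
			return 1;
		/* The kernel reads the module image from fd and loads it. */
		if (syscall(__NR_finit_module, fd, "") != 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}
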
Signed-off-by: Kees Cook Cc: Andrew Morton Signed-off-by: Rusty Russell (merge fixes) --- kernel/module.c | 367 +++++++++++++++++++++++++++++++++----------------------- kernel/sys_ni.c | 1 + 2 files changed, 220 insertions(+), 148 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6e48c3a43599..6d2c4e4ca1f5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -2425,18 +2426,17 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, - const void *mod, unsigned long *_len) +static int module_sig_check(struct load_info *info) { int err = -ENOKEY; - unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - unsigned long len = *_len; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const void *mod = info->hdr; - if (len > markerlen && - memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + if (info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ - *_len -= markerlen; - err = mod_verify_sig(mod, _len); + info->len -= markerlen; + err = mod_verify_sig(mod, &info->len); } if (!err) { @@ -2454,59 +2454,97 @@ static int module_sig_check(struct load_info *info, return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, - void *mod, unsigned long *len) +static int module_sig_check(struct load_info *info) { return 0; } #endif /* !CONFIG_MODULE_SIG */ -/* Sets info->hdr, info->len and info->sig_ok. */ -static int copy_and_check(struct load_info *info, - const void __user *umod, unsigned long len, - const char __user *uargs) +/* Sanity checks against invalid binaries, wrong arch, weird elf version. */ +static int elf_header_check(struct load_info *info) { - int err; - Elf_Ehdr *hdr; + if (info->len < sizeof(*(info->hdr))) + return -ENOEXEC; + + if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0 + || info->hdr->e_type != ET_REL + || !elf_check_arch(info->hdr) + || info->hdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (info->hdr->e_shoff >= info->len + || (info->hdr->e_shnum * sizeof(Elf_Shdr) > + info->len - info->hdr->e_shoff)) + return -ENOEXEC; - if (len < sizeof(*hdr)) + return 0; +} + +/* Sets info->hdr and info->len. */ +static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) +{ + info->len = len; + if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ - if ((hdr = vmalloc(len)) == NULL) + info->hdr = vmalloc(info->len); + if (!info->hdr) return -ENOMEM; - if (copy_from_user(hdr, umod, len) != 0) { - err = -EFAULT; - goto free_hdr; + if (copy_from_user(info->hdr, umod, info->len) != 0) { + vfree(info->hdr); + return -EFAULT; } - err = module_sig_check(info, hdr, &len); + return 0; +} + +/* Sets info->hdr and info->len. 
*/ +static int copy_module_from_fd(int fd, struct load_info *info) +{ + struct file *file; + int err; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + file = fget(fd); + if (!file) + return -ENOEXEC; + + err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); if (err) - goto free_hdr; + goto out; - /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 - || hdr->e_type != ET_REL - || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(Elf_Shdr)) { - err = -ENOEXEC; - goto free_hdr; + if (stat.size > INT_MAX) { + err = -EFBIG; + goto out; } - - if (hdr->e_shoff >= len || - hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { - err = -ENOEXEC; - goto free_hdr; + info->hdr = vmalloc(stat.size); + if (!info->hdr) { + err = -ENOMEM; + goto out; } - info->hdr = hdr; - info->len = len; - return 0; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(info->hdr); + err = bytes; + goto out; + } + if (bytes == 0) + break; + pos += bytes; + } + info->len = pos; -free_hdr: - vfree(hdr); +out: + fput(file); return err; } @@ -2945,33 +2983,123 @@ static bool finished_loading(const char *name) return ret; } +/* Call module constructors. */ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* This is where the real work happens */ +static int do_init_module(struct module *mod) +{ + int ret = 0; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; + } + if (ret > 0) { + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } + + /* Now it's a first class citizen! */ + mod->state = MODULE_STATE_LIVE; + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + + mutex_lock(&module_mutex); + /* Drop initial reference. 
*/ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_ro_size = 0; + mod->init_text_size = 0; + mutex_unlock(&module_mutex); + wake_up_all(&module_wq); + + return 0; +} + +static int may_init_module(void) +{ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +static int load_module(struct load_info *info, const char __user *uargs) { - struct load_info info = { NULL, }; struct module *mod, *old; long err; - pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); + err = module_sig_check(info); + if (err) + goto free_copy; - /* Copy in the blobs from userspace, check they are vaguely sane. */ - err = copy_and_check(&info, umod, len, uargs); + err = elf_header_check(info); if (err) - return ERR_PTR(err); + goto free_copy; /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(&info); + mod = layout_and_allocate(info); if (IS_ERR(mod)) { err = PTR_ERR(mod); goto free_copy; } #ifdef CONFIG_MODULE_SIG - mod->sig_ok = info.sig_ok; + mod->sig_ok = info->sig_ok; if (!mod->sig_ok) add_taint_module(mod, TAINT_FORCED_MODULE); #endif @@ -2983,25 +3111,25 @@ static struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, &info); + find_module_sections(mod, info); err = check_module_license_and_versions(mod); if (err) goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, &info); + setup_modinfo(mod, info); /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(mod, &info); + err = simplify_symbols(mod, info); if (err < 0) goto free_modinfo; - err = apply_relocations(mod, &info); + err = apply_relocations(mod, info); if (err < 0) goto free_modinfo; - err = post_relocation(mod, &info); + err = post_relocation(mod, info); if (err < 0) goto free_modinfo; @@ -3041,14 +3169,14 @@ again: } /* This has to be done once we're sure module name is unique. */ - dynamic_debug_setup(info.debug, info.num_debug); + dynamic_debug_setup(info->debug, info->num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) goto ddebug; - module_bug_finalize(info.hdr, info.sechdrs, mod); + module_bug_finalize(info->hdr, info->sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -3059,16 +3187,17 @@ again: goto unlink; /* Link in to syfs. */ - err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); + err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto unlink; /* Get rid of temporary copy. */ - free_copy(&info); + free_copy(info); /* Done! 
*/ trace_module_load(mod); - return mod; + + return do_init_module(mod); unlink: mutex_lock(&module_mutex); @@ -3077,7 +3206,7 @@ again: module_bug_cleanup(mod); wake_up_all(&module_wq); ddebug: - dynamic_debug_remove(info.debug); + dynamic_debug_remove(info->debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); @@ -3089,106 +3218,48 @@ again: free_unload: module_unload_free(mod); free_module: - module_deallocate(mod, &info); + module_deallocate(mod, info); free_copy: - free_copy(&info); - return ERR_PTR(err); -} - -/* Call module constructors. */ -static void do_mod_ctors(struct module *mod) -{ -#ifdef CONFIG_CONSTRUCTORS - unsigned long i; - - for (i = 0; i < mod->num_ctors; i++) - mod->ctors[i](); -#endif + free_copy(info); + return err; } -/* This is where the real work happens */ SYSCALL_DEFINE3(init_module, void __user *, umod, unsigned long, len, const char __user *, uargs) { - struct module *mod; - int ret = 0; - - /* Must have permission */ - if (!capable(CAP_SYS_MODULE) || modules_disabled) - return -EPERM; + int err; + struct load_info info = { }; - /* Do all the hard work */ - mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) - return PTR_ERR(mod); + err = may_init_module(); + if (err) + return err; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); + pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); + err = copy_module_from_user(umod, len, &info); + if (err) + return err; - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + return load_module(&info, uargs); +} - do_mod_ctors(mod); - /* Start the module */ - if (mod->init != NULL) - ret = do_one_initcall(mod->init); - if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; - } - if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); - dump_stack(); - } +SYSCALL_DEFINE2(finit_module, int, fd, const char __user *, uargs) +{ + int err; + struct load_info info = { }; - /* Now it's a first class citizen! */ - mod->state = MODULE_STATE_LIVE; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_LIVE, mod); + err = may_init_module(); + if (err) + return err; - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + pr_debug("finit_module: fd=%d, uargs=%p\n", fd, uargs); - mutex_lock(&module_mutex); - /* Drop initial reference. 
*/ - module_put(mod); - trim_init_extable(mod); -#ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; -#endif - unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; - mutex_unlock(&module_mutex); - wake_up_all(&module_wq); + err = copy_module_from_fd(fd, &info); + if (err) + return err; - return 0; + return load_module(&info, uargs); } static inline int within(unsigned long addr, void *start, unsigned long size) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index dbff751e4086..395084d4ce16 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); +cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); cond_syscall(sys_bind); -- cgit v1.2.3 From 2f3238aebedb243804f58d62d57244edec4149b2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Oct 2012 18:09:41 +1030 Subject: module: add flags arg to sys_finit_module() Thanks to Michael Kerrisk for keeping us honest. These flags are actually useful for eliminating the only case where kmod has to mangle a module's internals: for overriding module versioning. Signed-off-by: Rusty Russell Acked-by: Lucas De Marchi Acked-by: Kees Cook --- kernel/module.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6d2c4e4ca1f5..1395ca382fb5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "module-internal.h" #define CREATE_TRACE_POINTS @@ -2553,7 +2554,7 @@ static void free_copy(struct load_info *info) vfree(info->hdr); } -static int rewrite_section_headers(struct load_info *info) +static int rewrite_section_headers(struct load_info *info, int flags) { unsigned int i; @@ -2581,7 +2582,10 @@ static int rewrite_section_headers(struct load_info *info) } /* Track but don't keep modinfo and version sections. */ - info->index.vers = find_sec(info, "__versions"); + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.info = find_sec(info, ".modinfo"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -2596,7 +2600,7 @@ static int rewrite_section_headers(struct load_info *info) * Return the temporary module pointer (we'll replace it with the final * one when we move the module sections around). 
*/ -static struct module *setup_load_info(struct load_info *info) +static struct module *setup_load_info(struct load_info *info, int flags) { unsigned int i; int err; @@ -2607,7 +2611,7 @@ static struct module *setup_load_info(struct load_info *info) info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info); + err = rewrite_section_headers(info, flags); if (err) return ERR_PTR(err); @@ -2645,11 +2649,14 @@ static struct module *setup_load_info(struct load_info *info) return mod; } -static int check_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo(struct module *mod, struct load_info *info, int flags) { const char *modmagic = get_modinfo(info, "vermagic"); int err; + if (flags & MODULE_INIT_IGNORE_VERMAGIC) + modmagic = NULL; + /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); @@ -2885,18 +2892,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } -static struct module *layout_and_allocate(struct load_info *info) +static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; Elf_Shdr *pcpusec; int err; - mod = setup_load_info(info); + mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; - err = check_modinfo(mod, info); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -3078,7 +3085,8 @@ static int may_init_module(void) /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static int load_module(struct load_info *info, const char __user *uargs) +static int load_module(struct load_info *info, const char __user *uargs, + int flags) { struct module *mod, *old; long err; @@ -3092,7 +3100,7 @@ static int load_module(struct load_info *info, const char __user *uargs) goto free_copy; /* Figure out module layout, and allocate all the memory. */ - mod = layout_and_allocate(info); + mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { err = PTR_ERR(mod); goto free_copy; @@ -3241,10 +3249,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (err) return err; - return load_module(&info, uargs); + return load_module(&info, uargs, 0); } -SYSCALL_DEFINE2(finit_module, int, fd, const char __user *, uargs) +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { int err; struct load_info info = { }; @@ -3253,13 +3261,17 @@ SYSCALL_DEFINE2(finit_module, int, fd, const char __user *, uargs) if (err) return err; - pr_debug("finit_module: fd=%d, uargs=%p\n", fd, uargs); + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC)) + return -EINVAL; err = copy_module_from_fd(fd, &info); if (err) return err; - return load_module(&info, uargs); + return load_module(&info, uargs, flags); } static inline int within(unsigned long addr, void *start, unsigned long size) -- cgit v1.2.3 From 2e72d51b4ac32989496870cd8171b3682fea1839 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Oct 2012 07:32:07 +1030 Subject: security: introduce kernel_module_from_file hook Now that kernel module origins can be reasoned about, provide a hook to the LSMs to make policy decisions about the module file. This will let Chrome OS enforce that loadable kernel modules can only come from its read-only hash-verified root filesystem. 
Other LSMs can, for example, read extended attributes for signatures, etc. Signed-off-by: Kees Cook Acked-by: Serge E. Hallyn Acked-by: Eric Paris Acked-by: Mimi Zohar Acked-by: James Morris Signed-off-by: Rusty Russell --- kernel/module.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1395ca382fb5..a1d2ed8bab93 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -2485,10 +2486,16 @@ static int elf_header_check(struct load_info *info) static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) { + int err; + info->len = len; if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; + err = security_kernel_module_from_file(NULL); + if (err) + return err; + /* Suck in entire file: we'll want most of it. */ info->hdr = vmalloc(info->len); if (!info->hdr) @@ -2515,6 +2522,10 @@ static int copy_module_from_fd(int fd, struct load_info *info) if (!file) return -ENOEXEC; + err = security_kernel_module_from_file(file); + if (err) + goto out; + err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); if (err) goto out; -- cgit v1.2.3 From 54523ec71f8ce99accae97c74152f14f261f7e18 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Wed, 5 Dec 2012 12:29:04 +1030 Subject: module: Remove a extra null character at the top of module->strtab. There is a extra null character('\0') at the top of module->strtab for each module. Commit 59ef28b introduced this bug and this patch fixes it. Live dump log of the current linus git kernel(HEAD is 2844a4870): ============================================================================ crash> mod | grep loop ffffffffa01db0a0 loop 16689 (not loaded) [CONFIG_KALLSYMS] crash> module.core_symtab ffffffffa01db0a0 core_symtab = 0xffffffffa01db320crash> rd 0xffffffffa01db320 12 ffffffffa01db320: 0000005500000001 0000000000000000 ....U........... ffffffffa01db330: 0000000000000000 0002007400000002 ............t... ffffffffa01db340: ffffffffa01d8000 0000000000000038 ........8....... ffffffffa01db350: 001a00640000000e ffffffffa01daeb0 ....d........... ffffffffa01db360: 00000000000000a0 0002007400000019 ............t... ffffffffa01db370: ffffffffa01d8068 000000000000001b h............... crash> module.core_strtab ffffffffa01db0a0 core_strtab = 0xffffffffa01dbb30 "" crash> rd 0xffffffffa01dbb30 4 ffffffffa01dbb30: 615f70616d6b0000 66780063696d6f74 ..kmap_atomic.xf ffffffffa01dbb40: 73636e75665f7265 72665f646e696600 er_funcs.find_fr ============================================================================ We expect Just first one byte of '\0', but actually first two bytes are '\0'. Here is The relationship between symtab and strtab. symtab_idx strtab_idx symbol ----------------------------------------------- 0 0x1 "\0" # startab_idx should be 0 1 0x2 "kmap_atomic" 2 0xe "xfer_funcs" 3 0x19 "find_fr..." By applying this patch, it becomes as follows. symtab_idx strtab_idx symbol ----------------------------------------------- 0 0x0 "\0" # extra byte is removed 1 0x1 "kmap_atomic" 2 0xd "xfer_funcs" 3 0x18 "find_fr..." 
Signed-off-by: Satoru Takeuchi Cc: Masaki Kimura Cc: Rusty Russell Cc: Greg Kroah-Hartman Signed-off-by: Rusty Russell --- kernel/module.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a1d2ed8bab93..79a526dd1b11 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2285,7 +2285,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *symsect = info->sechdrs + info->index.sym; Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size; + unsigned int i, nsrc, ndst, strtab_size = 0; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; @@ -2296,9 +2296,6 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - /* strtab always starts with a nul, so offset 0 is the empty string. */ - strtab_size = 1; - /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || @@ -2340,7 +2337,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *s++ = 0; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { -- cgit v1.2.3 From 82fab442f5322b016f72891c0db2436c6a6c20b7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 11 Dec 2012 09:38:33 +1030 Subject: modules: don't hand 0 to vmalloc. In commit d0a21265dfb5fa8a David Rientjes unified various archs' module_alloc implementation (including x86) and removed the graduitous shortcut for size == 0. Then, in commit de7d2b567d040e3b, Joe Perches added a warning for zero-length vmallocs, which can happen without kallsyms on modules with no init sections (eg. zlib_deflate). Fix this once and for all; the module code has to handle zero length anyway, so get it right at the caller and remove the now-gratuitous checks within the arch-specific module_alloc implementations. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=42608 Reported-by: Conrad Kostecki Cc: David Rientjes Cc: Joe Perches Signed-off-by: Rusty Russell --- kernel/module.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 79a526dd1b11..4542ff3f7ef9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2377,7 +2377,7 @@ static void dynamic_debug_remove(struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return size == 0 ? NULL : vmalloc_exec(size); + return vmalloc_exec(size); } static void *module_alloc_update_bounds(unsigned long size) @@ -2793,20 +2793,23 @@ static int move_module(struct module *mod, struct load_info *info) memset(ptr, 0, mod->core_size); mod->module_core = ptr; - ptr = module_alloc_update_bounds(mod->init_size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. 
- */ - kmemleak_ignore(ptr); - if (!ptr && mod->init_size) { - module_free(mod, mod->module_core); - return -ENOMEM; - } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + if (mod->init_size) { + ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr) { + module_free(mod, mod->module_core); + return -ENOMEM; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + } else + mod->module_init = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); -- cgit v1.2.3 From 919aa45e43a84d40c27c83f6117cfa6542cee14e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 11 Dec 2012 11:37:13 +1030 Subject: MODSIGN: Avoid using .incbin in C source Using the asm .incbin statement in C sources breaks any gcc wrapper which assumes that preprocessed C source is self-contained. Use a separate .S file to include the siging key and certificate. [ This means we no longer need SYMBOL_PREFIX which is defined in kernel.h from cbdbf2abb7844548a7d7a6a2ae7af6b6fbcea401, so I removed it -- RR ] Tested-by: Michal Marek Signed-off-by: Takashi Iwai Signed-off-by: Rusty Russell Acked-by: James Hogan --- kernel/Makefile | 4 ++-- kernel/modsign_certificate.S | 19 +++++++++++++++++++ kernel/modsign_pubkey.c | 6 ------ 3 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 kernel/modsign_certificate.S (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 86e3285ae7e5..0bd9d436341e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -139,7 +139,7 @@ ifeq ($(CONFIG_MODULE_SIG),y) extra_certificates: touch $@ -kernel/modsign_pubkey.o: signing_key.x509 extra_certificates +kernel/modsign_certificate.o: signing_key.x509 extra_certificates ############################################################################### # diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S new file mode 100644 index 000000000000..246b4c6e6135 --- /dev/null +++ b/kernel/modsign_certificate.S @@ -0,0 +1,19 @@ +/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ +#ifndef SYMBOL_PREFIX +#define ASM_SYMBOL(sym) sym +#else +#define PASTE2(x,y) x##y +#define PASTE(x,y) PASTE2(x,y) +#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) +#endif + +#define GLOBAL(name) \ + .globl ASM_SYMBOL(name); \ + ASM_SYMBOL(name): + + .section ".init.data","aw" + +GLOBAL(modsign_certificate_list) + .incbin "signing_key.x509" + .incbin "extra_certificates" +GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 767e559dfb10..045504fffbb2 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -20,12 +20,6 @@ struct key *modsign_keyring; extern __initdata const u8 modsign_certificate_list[]; extern __initdata const u8 modsign_certificate_list_end[]; -asm(".section .init.data,\"aw\"\n" - SYMBOL_PREFIX 
"modsign_certificate_list:\n" - ".incbin \"signing_key.x509\"\n" - ".incbin \"extra_certificates\"\n" - SYMBOL_PREFIX "modsign_certificate_list_end:" - ); /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice -- cgit v1.2.3 From e10e1774efbdaec54698454200619a03a01e1d64 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Tue, 11 Dec 2012 12:26:22 +1030 Subject: MODSIGN: Fix kbuild output when using default extra_certificates Reported-by: Peter Foley Signed-off-by: Michal Marek Acked-by: Peter Foley Signed-off-by: Rusty Russell --- kernel/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0bd9d436341e..17f90b69d338 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -136,8 +136,12 @@ ifeq ($(CONFIG_MODULE_SIG),y) # # Pull the signing certificate and any extra certificates into the kernel # + +quiet_cmd_touch = TOUCH $@ + cmd_touch = touch $@ + extra_certificates: - touch $@ + $(call cmd,touch) kernel/modsign_certificate.o: signing_key.x509 extra_certificates -- cgit v1.2.3 From 17bc14b767cf0692420c43dbe5310ae98a5a7836 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 14 Dec 2012 07:20:43 -0800 Subject: Revert "sched: Update_cfs_shares at period edge" This reverts commit f269ae0469fc882332bdfb5db15d3c1315fe2a10. It turns out it causes a very noticeable interactivity regression with CONFIG_SCHED_AUTOGROUP (test-case: "make -j32" of the kernel in a terminal window, while scrolling in a browser - the autogrouping means that the two end up in separate cgroups, and the browser should be smooth as silk despite the high load). Says Paul Turner: "It seems that the update-throttling on the wake-side is reducing the interactive tasks' ability to preempt. While I suspect the right longer term answer here is force these updates only in the cross-cgroup case; this is less trivial. For this release I believe the right answer is either going to be a revert or restore the updates on the enqueue-side." Reported-by: Linus Torvalds Bisected-by: Mike Galbraith Acked-by: Paul Turner Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59e072b2db97..756f9f9e8542 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1265,7 +1265,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) } __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); - update_cfs_shares(cfs_rq); } static inline void update_rq_runnable_avg(struct rq *rq, int runnable) @@ -1475,8 +1474,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - account_entity_enqueue(cfs_rq, se); enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1549,6 +1549,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -1568,8 +1569,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); /* * Normalize the entity after updating the min_vruntime because the @@ -1583,7 +1584,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - se->on_rq = 0; + update_cfs_shares(cfs_rq); } /* @@ -2595,8 +2596,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); - update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -2656,8 +2657,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_cfs_shares(cfs_rq); update_entity_load_avg(se, 1); - update_cfs_rq_blocked_load(cfs_rq, 0); } if (!se) { @@ -5837,11 +5838,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) { + for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); - /* update contribution to parent */ - update_entity_load_avg(se, 1); - } raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From 5e4a08476b50fa39210fca82e03325cc46b9c235 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Dec 2012 07:55:36 -0800 Subject: userns: Require CAP_SYS_ADMIN for most uses of setns. Andy Lutomirski found a nasty little bug in the permissions of setns. With unprivileged user namespaces it became possible to create new namespaces without privilege. However the setns calls were relaxed to only require CAP_SYS_ADMIN in the user nameapce of the targed namespace. Which made the following nasty sequence possible. pid = clone(CLONE_NEWUSER | CLONE_NEWNS); if (pid == 0) { /* child */ system("mount --bind /home/me/passwd /etc/passwd"); } else if (pid != 0) { /* parent */ char path[PATH_MAX]; snprintf(path, sizeof(path), "/proc/%u/ns/mnt"); fd = open(path, O_RDONLY); setns(fd, 0); system("su -"); } Prevent this possibility by requiring CAP_SYS_ADMIN in the current user namespace when joing all but the user namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- kernel/pid_namespace.c | 3 ++- kernel/utsname.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 560da0dab230..fdbd0cdf271a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -325,7 +325,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = ns; - if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/utsname.c b/kernel/utsname.c index f6336d51d64c..08b197e8c485 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -113,7 +113,8 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) { struct uts_namespace *ns = new; - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); -- cgit v1.2.3 From aa6d054e5ce94720797ca260392a74dbced56412 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Dec 2012 08:50:54 -0800 Subject: userns: Add a more complete capability subset test to commit_creds When unsharing a user namespace we reduce our credentials to just what can be done in that user namespace. This is a subset of the credentials we previously had. Teach commit_creds to recognize this is a subset of the credentials we have had before and don't clear the dumpability flag. This allows an unprivileged program to do: unshare(CLONE_NEWUSER); fd = open("/proc/self/uid_map", O_RDWR); Where previously opening the uid_map writable would fail because the the task had been made non-dumpable. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/cred.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 48cea3da6d05..709d521903f6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -455,6 +455,31 @@ error_put: return ret; } +static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) +{ + const struct user_namespace *set_ns = set->user_ns; + const struct user_namespace *subset_ns = subset->user_ns; + + /* If the two credentials are in the same user namespace see if + * the capabilities of subset are a subset of set. + */ + if (set_ns == subset_ns) + return cap_issubset(subset->cap_permitted, set->cap_permitted); + + /* The credentials are in a different user namespaces + * therefore one is a subset of the other only if a set is an + * ancestor of subset and set->euid is owner of subset or one + * of subsets ancestors. + */ + for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { + if ((set_ns == subset_ns->parent) && + uid_eq(subset_ns->owner, set->euid)) + return true; + } + + return false; +} + /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned @@ -493,7 +518,7 @@ int commit_creds(struct cred *new) !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || - !cap_issubset(new->cap_permitted, old->cap_permitted)) { + !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; -- cgit v1.2.3 From 5155040ed349950e16c093ba8e65ad534994df2a Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Sun, 9 Dec 2012 17:19:52 -0800 Subject: userns: Fix typo in description of the limitation of userns_install Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f5975ccf9348..2b042c42fbc4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -799,7 +799,7 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) if (user_ns == current_user_ns()) return -EINVAL; - /* Threaded many not enter a different user namespace */ + /* Threaded processes may not enter a different user namespace */ if (atomic_read(¤t->mm->mm_users) > 1) return -EINVAL; -- cgit v1.2.3 From 6133705494bb02953e1e2cc3018a4373981b3c97 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Sun, 16 Dec 2012 22:18:11 -0500 Subject: random: Mix cputime from each thread that exits to the pool When a thread exits mix it's cputime (userspace + kernelspace) to the entropy pool. We don't know how "random" this is, so we use add_device_randomness that doesn't mess with entropy count. Signed-off-by: Nick Kossifidis Signed-off-by: Theodore Ts'o --- kernel/posix-cpu-timers.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..f07827a4e281 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -494,6 +495,8 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers, tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); -- cgit v1.2.3 From 221392c3ad0432e39fd74a349364f66cb0ed78f6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 17 Dec 2012 14:05:53 +0000 Subject: sched: numa: Fix build error if CONFIG_NUMA_BALANCING && !CONFIG_TRANSPARENT_HUGEPAGE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michal Hocko reported that the following build error occurs if CONFIG_NUMA_BALANCING is set without THP support kernel/sched/fair.c: In function ‘task_numa_work’: kernel/sched/fair.c:932:55: error: call to ‘__build_bug_failed’ declared with attribute error: BUILD_BUG failed The problem is that HPAGE_PMD_SHIFT triggers a BUILD_BUG() on !CONFIG_TRANSPARENT_HUGEPAGE. This patch addresses the problem. Reported-by: Michal Hocko Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9af5af979a13..4603d6cb9e25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -929,7 +929,7 @@ void task_numa_work(struct callback_head *work) continue; /* Skip small VMAs. 
They are not likely to be of relevance */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) + if (vma->vm_end - vma->vm_start < HPAGE_SIZE) continue; do { -- cgit v1.2.3 From 8ec7d50f1ed2b072d23ce810f86df09ee0568d4b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 17 Dec 2012 15:59:36 -0800 Subject: kernel: remove reference to feature-removal-schedule.txt In commit 9c0ece069b32 ("Get rid of Documentation/feature-removal.txt"), Linus removed feature-removal-schedule.txt from Documentation, but there is still some reference to this file. So remove them. Signed-off-by: Tao Ma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - kernel/module.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f34c41bfaa37..571264830a29 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1333,7 +1333,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* See feature-removal-schedule.txt */ if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); diff --git a/kernel/module.c b/kernel/module.c index 6e48c3a43599..808bd62e1723 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -372,9 +372,6 @@ static bool check_symbol(const struct symsearch *syms, printk(KERN_WARNING "Symbol %s is being used " "by a non-GPL module, which will not " "be allowed in the future\n", fsa->name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more details.\n"); } } -- cgit v1.2.3 From 965c8e59cfcf845ecde2265a1d1bfee5f011d302 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Dec 2012 15:59:39 -0800 Subject: lseek: the "whence" argument is called "whence" But the kernel decided to call it "origin" instead. Fix most of the sites. Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index afd092de45b7..3ffe4c5ad3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2675,12 +2675,12 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else file->f_pos = ret = 1; -- cgit v1.2.3 From 0f34c400914f165b7b3812459be2d77b8aa1f1e4 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 17 Dec 2012 15:59:50 -0800 Subject: watchdog: store the watchdog sample period as a variable Currently getting the sample period is always thru a complex calculation: get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5). We can store the sample period as a variable, and set it as __read_mostly type. 
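The cached value itself is unchanged; with the default watchdog_thresh of 10 and assuming get_softlockup_thresh() still returns 2 * watchdog_thresh, the stored period works out to:

	sample_period = (2 * 10) * (NSEC_PER_SEC / 5)
	              = 20 * 200000000 ns
	              = 4000000000 ns	/* 4 seconds, matching the comment in the hunk below */
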
Signed-off-by: liu chuansheng Cc: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c8c21be11ab4..997c6a16ec22 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_disabled; +static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -116,7 +117,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static u64 get_sample_period(void) +static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -125,7 +126,7 @@ static u64 get_sample_period(void) * and hard thresholds) to increment before the * hardlockup detector generates a warning */ - return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -275,7 +276,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) wake_up_process(__this_cpu_read(softlockup_watchdog)); /* .. and repeat */ - hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); if (touch_ts == 0) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { @@ -356,7 +357,7 @@ static void watchdog_enable(unsigned int cpu) hrtimer->function = watchdog_timer_fn; /* done here because hrtimer_start can only pin to smp_processor_id() */ - hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); /* initialize timestamp */ @@ -386,7 +387,7 @@ static int watchdog_should_run(unsigned int cpu) /* * The watchdog thread function - touches the timestamp. * - * It only runs once every get_sample_period() seconds (4 seconds by + * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). @@ -519,6 +520,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, if (ret || !write) return ret; + set_sample_period(); if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else @@ -540,6 +542,7 @@ static struct smp_hotplug_thread watchdog_threads = { void __init lockup_detector_init(void) { + set_sample_period(); if (smpboot_register_percpu_thread(&watchdog_threads)) { pr_err("Failed to create watchdog threads, disabled\n"); watchdog_disabled = -ENODEV; -- cgit v1.2.3 From 2fa72c8fa5d03c4e07894ccb9f0be72e8687a455 Mon Sep 17 00:00:00 2001 From: Andrew Cooks Date: Mon, 17 Dec 2012 15:59:56 -0800 Subject: printk: boot_delay should only affect output The boot_delay parameter affects all printk(), even if the log level prevents visible output from the call. It results in delays greater than the user intended without purpose. This patch changes the behaviour of boot_delay to only delay output. 
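A hedged example of the new behaviour: booting with

	boot_delay=100 loglevel=4

a KERN_DEBUG printk (level 7) is suppressed by the console loglevel and no longer costs 100 ms, while a KERN_ERR printk (level 3) is still printed and still delayed; adding ignore_loglevel makes every message visible again and therefore delayed again, as the hunk below shows.
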
Signed-off-by: Andrew Cooks Acked-by: Randy Dunlap Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 22e070f3470a..19c0d7bcf24a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -747,6 +747,21 @@ void __init setup_log_buf(int early) free, (free * 100) / __LOG_BUF_LEN); } +static bool __read_mostly ignore_loglevel; + +static int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 0; +} + +early_param("ignore_loglevel", ignore_loglevel_setup); +module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" + "print all kernel messages to the console."); + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -770,13 +785,15 @@ static int __init boot_delay_setup(char *str) } __setup("boot_delay=", boot_delay_setup); -static void boot_delay_msec(void) +static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; - if (boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + || (level >= console_loglevel && !ignore_loglevel)) { return; + } k = (unsigned long long)loops_per_msec * boot_delay; @@ -795,7 +812,7 @@ static void boot_delay_msec(void) } } #else -static inline void boot_delay_msec(void) +static inline void boot_delay_msec(int level) { } #endif @@ -1238,21 +1255,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } -static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -1498,7 +1500,7 @@ asmlinkage int vprintk_emit(int facility, int level, int this_cpu; int printed_len = 0; - boot_delay_msec(); + boot_delay_msec(level); printk_delay(); /* This stops the holder of console_sem just where we want him */ -- cgit v1.2.3 From b2e902f024fa6f6f27b335c478d81bab0cb2c768 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Dec 2012 16:01:27 -0800 Subject: trace: use kbasename() Signed-off-by: Andy Shevchenko Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_uprobe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9614db8b0f8c..c86e6d4f67fb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -263,16 +264,15 @@ static int create_trace_uprobe(int argc, char **argv) /* setup a probe */ if (!event) { - char *tail = strrchr(filename, '/'); + char *tail; char *ptr; - ptr = kstrdup((tail ? 
tail + 1 : filename), GFP_KERNEL); - if (!ptr) { + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { ret = -ENOMEM; goto fail_address_parse; } - tail = ptr; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; -- cgit v1.2.3 From 0ad50c3896afbb3c103409a18260e601b87a744c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 17 Dec 2012 16:01:45 -0800 Subject: compat: generic compat_sys_sched_rr_get_interval() implementation This function is used by sparc, powerpc tile and arm64 for compat support. The patch adds a generic implementation with a wrapper for PowerPC to do the u32->int sign extension. The reason for a single patch covering powerpc, tile, sparc and arm64 is to keep it bisectable, otherwise kernel building may fail with mismatched function declarations. Signed-off-by: Catalin Marinas Acked-by: Chris Metcalf [for tile] Acked-by: David S. Miller Acked-by: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index c28a306ae05c..f6150e92dfc9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1215,6 +1215,23 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } +#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL +asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval) +{ + struct timespec t; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); + set_fs(old_fs); + if (put_compat_timespec(&t, interval)) + return -EFAULT; + return ret; +} +#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ + /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. -- cgit v1.2.3 From 992fb6e170639b0849bace8e49bf31bd37c4123c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Dec 2012 16:03:07 -0800 Subject: ptrace: introduce PTRACE_O_EXITKILL Ptrace jailers want to be sure that the tracee can never escape from the control. However if the tracer dies unexpectedly the tracee continues to run in potentially unsafe mode. Add the new ptrace option PTRACE_O_EXITKILL. If the tracer exits it sends SIGKILL to every tracee which has this bit set. Note that the new option is not equal to the last-option << 1. Because currently all options have an event, and the new one starts the eventless group. It uses the random 20 bit, so we have the room for 12 more events, but we can also add the new eventless options below this one. Suggested by Amnon Shiloh. Signed-off-by: Oleg Nesterov Tested-by: Amnon Shiloh Cc: Denys Vlasenko Cc: Michael Kerrisk Cc: Serge Hallyn Cc: Chris Evans Cc: David Howells Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f5e55dda955..ec8118ab2a47 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -457,6 +457,9 @@ void exit_ptrace(struct task_struct *tracer) return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (unlikely(p->ptrace & PT_EXITKILL)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, p); + if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } -- cgit v1.2.3 From a5ba911ec3792168530d35e16a8ec3b6fc60bcb5 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 17 Dec 2012 16:03:22 -0800 Subject: pidns: remove unused is_container_init() Since commit 1cdcbec1a337 ("CRED: Neuter sys_capset()") is_container_init() has no callers. Signed-off-by: Gao feng Cc: David Howells Acked-by: Serge Hallyn Cc: James Morris Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index fd996c1ed9f8..a54a1123c7cf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -81,21 +81,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). -- cgit v1.2.3 From 19af395d7c0daaafdebd441a162128aaac575912 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 18 Dec 2012 14:21:25 -0800 Subject: irq: tsk->comm is an array The array check is useless so remove it. [akpm@linux-foundation.org: remove comment, per David] Signed-off-by: Alan Cox Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 35c70c9e24d8..e49a288fa479 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused) action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); -- cgit v1.2.3 From 50bdd430c20566b13d8bc59946184b08f5875de6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:04 -0800 Subject: res_counter: return amount of charges after res_counter_uncharge() It is useful to know how many charges are still left after a call to res_counter_uncharge. While it is possible to issue a res_counter_read after uncharge, this can be racy. If we need, for instance, to take some action when the counters drop down to 0, only one of the callers should see it. This is the same semantics as the atomic variables in the kernel. Since the current return value is void, we don't need to worry about anything breaking due to this change: nobody relied on that, and only users appearing from now on will be checking this value. 
Signed-off-by: Glauber Costa Reviewed-by: Michal Hocko Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 3920d593e63c..ff55247e7049 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) val = counter->usage; counter->usage -= val; + return counter->usage; } -void res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) +u64 res_counter_uncharge_until(struct res_counter *counter, + struct res_counter *top, + unsigned long val) { unsigned long flags; struct res_counter *c; + u64 ret = 0; local_irq_save(flags); for (c = counter; c != top; c = c->parent) { + u64 r; spin_lock(&c->lock); - res_counter_uncharge_locked(c, val); + r = res_counter_uncharge_locked(c, val); + if (c == counter) + ret = r; spin_unlock(&c->lock); } local_irq_restore(flags); + return ret; } -void res_counter_uncharge(struct res_counter *counter, unsigned long val) +u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) { - res_counter_uncharge_until(counter, NULL, val); + return res_counter_uncharge_until(counter, NULL, val); } static inline unsigned long long * -- cgit v1.2.3 From 2ad306b17c0ac5a1b1f250d5f772aeb87fdf1eba Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:18 -0800 Subject: fork: protect architectures where THREAD_SIZE >= PAGE_SIZE against fork bombs Because those architectures will draw their stacks directly from the page allocator, rather than the slab cache, we can directly pass __GFP_KMEMCG flag, and issue the corresponding free_pages. This code path is taken when the architecture doesn't define CONFIG_ARCH_THREAD_INFO_ALLOCATOR (only ia64 seems to), and has THREAD_SIZE >= PAGE_SIZE. Luckily, most - if not all - of the remaining architectures fall in this category. This will guarantee that every stack page is accounted to the memcg the process currently lives on, and will have the allocations to fail if they go over limit. For the time being, I am defining a new variant of THREADINFO_GFP, not to mess with the other path. Once the slab is also tracked by memcg, we can get rid of that flag. 
Tested to successfully protect against :(){ :|:& };: Signed-off-by: Glauber Costa Acked-by: Frederic Weisbecker Acked-by: Kamezawa Hiroyuki Reviewed-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c36c4e301efe..85f6d536608d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; -- cgit v1.2.3 From 3935e89505a1c3ab3f3b0c7ef0eae54124f48905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 19 Dec 2012 20:51:31 +0100 Subject: watchdog: Fix disable/enable regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8d4516904b39 ("watchdog: Fix CPU hotplug regression") causes an oops or hard lockup when doing echo 0 > /proc/sys/kernel/nmi_watchdog echo 1 > /proc/sys/kernel/nmi_watchdog and the kernel is booted with nmi_watchdog=1 (default) Running laptop-mode-tools and disconnecting/connecting AC power will cause this to trigger, making it a common failure scenario on laptops. Instead of bailing out of watchdog_disable() when !watchdog_enabled we can initialize the hrtimer regardless of watchdog_enabled status. This makes it safe to call watchdog_disable() in the nmi_watchdog=0 case, without the negative effect on the enabled => disabled => enabled case. 
All these tests pass with this patch: - nmi_watchdog=1 echo 0 > /proc/sys/kernel/nmi_watchdog echo 1 > /proc/sys/kernel/nmi_watchdog - nmi_watchdog=0 echo 0 > /sys/devices/system/cpu/cpu1/online - nmi_watchdog=0 echo mem > /sys/power/state Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=51661 Cc: # v3.7 Cc: Norbert Warmuth Cc: Joseph Salisbury Cc: Thomas Gleixner Signed-off-by: Bjørn Mork Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 997c6a16ec22..75a2ab3d0b02 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -344,6 +344,10 @@ static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + /* kick off the timer for the hardlockup detector */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + if (!watchdog_enabled) { kthread_park(current); return; @@ -352,10 +356,6 @@ static void watchdog_enable(unsigned int cpu) /* Enable the perf event */ watchdog_nmi_enable(cpu); - /* kick off the timer for the hardlockup detector */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; - /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); @@ -369,9 +369,6 @@ static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - if (!watchdog_enabled) - return; - watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); /* disable the perf event */ -- cgit v1.2.3 From ae903caae267154de7cf8576b130ff474630596b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 12:44:11 -0500 Subject: Bury the conditionals from kernel_thread/kernel_execve series All architectures have CONFIG_GENERIC_KERNEL_THREAD CONFIG_GENERIC_KERNEL_EXECVE __ARCH_WANT_SYS_EXECVE None of them have __ARCH_WANT_KERNEL_EXECVE and there are only two callers of kernel_execve() (which is a trivial wrapper for do_execve() now) left. Kill the conditionals and make both callers use do_execve(). Signed-off-by: Al Viro --- kernel/fork.c | 2 -- kernel/kmod.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 540730783433..389712ffc0ad 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1623,7 +1623,6 @@ long do_fork(unsigned long clone_flags, return nr; } -#ifdef CONFIG_GENERIC_KERNEL_THREAD /* * Create a kernel thread. 
*/ @@ -1632,7 +1631,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } -#endif #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) diff --git a/kernel/kmod.c b/kernel/kmod.c index 1c317e386831..0023a87e8de6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -219,9 +219,9 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = kernel_execve(sub_info->path, - (const char *const *)sub_info->argv, - (const char *const *)sub_info->envp); + retval = do_execve(sub_info->path, + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); if (!retval) return 0; -- cgit v1.2.3 From 5c49574ffd7ac07eae8c3b065d19e6ebc7e4760f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 18 Nov 2012 15:29:16 -0500 Subject: new helper: restore_altstack() to be used by rt_sigreturn instances Signed-off-by: Al Viro --- kernel/signal.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e75e4bd2839b..887f2fefe207 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3103,6 +3103,13 @@ out: return error; } +int restore_altstack(const stack_t __user *uss) +{ + int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); + /* squash all but EFAULT for now */ + return err == -EFAULT ? err : 0; +} + #ifdef __ARCH_WANT_SYS_SIGPENDING /** -- cgit v1.2.3 From 6bf9adfc90370b695cb111116e15fdc0e1906270 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 14:09:47 -0500 Subject: introduce generic sys_sigaltstack(), switch x86 and um to it Conditional on CONFIG_GENERIC_SIGALTSTACK; architectures that do not select it are completely unaffected Signed-off-by: Al Viro --- kernel/signal.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 887f2fefe207..f05f4c4150d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3102,6 +3102,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s out: return error; } +#ifdef CONFIG_GENERIC_SIGALTSTACK +SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) +{ + return do_sigaltstack(uss, uoss, current_user_stack_pointer()); +} +#endif int restore_altstack(const stack_t __user *uss) { -- cgit v1.2.3 From 9026843952adac5b123c7b8dc961e5c15828d9e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 14:47:53 -0500 Subject: generic compat_sys_sigaltstack() Again, conditional on CONFIG_GENERIC_SIGALTSTACK Signed-off-by: Al Viro --- kernel/signal.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f05f4c4150d9..aee85bd76b8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -31,6 +31,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -3116,6 +3117,50 @@ int restore_altstack(const stack_t __user *uss) return err == -EFAULT ? 
err : 0; } +#ifdef CONFIG_COMPAT +#ifdef CONFIG_GENERIC_SIGALTSTACK +asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, + compat_stack_t __user *uoss_ptr) +{ + stack_t uss, uoss; + int ret; + mm_segment_t seg; + + if (uss_ptr) { + compat_stack_t uss32; + + memset(&uss, 0, sizeof(stack_t)); + if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) + return -EFAULT; + uss.ss_sp = compat_ptr(uss32.ss_sp); + uss.ss_flags = uss32.ss_flags; + uss.ss_size = uss32.ss_size; + } + seg = get_fs(); + set_fs(KERNEL_DS); + ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), + (stack_t __force __user *) &uoss, + compat_user_stack_pointer()); + set_fs(seg); + if (ret >= 0 && uoss_ptr) { + if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || + __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || + __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || + __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + ret = -EFAULT; + } + return ret; +} + +int compat_restore_altstack(const compat_stack_t __user *uss) +{ + int err = compat_sys_sigaltstack(uss, NULL); + /* squash all but -EFAULT for now */ + return err == -EFAULT ? err : 0; +} +#endif +#endif + #ifdef __ARCH_WANT_SYS_SIGPENDING /** -- cgit v1.2.3 From c40702c49faef05ae324f121d8b3e215244ee152 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Nov 2012 14:24:26 -0500 Subject: new helpers: __save_altstack/__compat_save_altstack, switch x86 and um to those note that they are relying on access_ok() already checked by caller. Signed-off-by: Al Viro --- kernel/signal.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index aee85bd76b8a..f072513302c3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3117,6 +3117,14 @@ int restore_altstack(const stack_t __user *uss) return err == -EFAULT ? err : 0; } +int __save_altstack(stack_t __user *uss, unsigned long sp) +{ + struct task_struct *t = current; + return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(sas_ss_flags(sp), &uss->ss_flags) | + __put_user(t->sas_ss_size, &uss->ss_size); +} + #ifdef CONFIG_COMPAT #ifdef CONFIG_GENERIC_SIGALTSTACK asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, @@ -3158,6 +3166,14 @@ int compat_restore_altstack(const compat_stack_t __user *uss) /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } + +int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) +{ + struct task_struct *t = current; + return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | + __put_user(sas_ss_flags(sp), &uss->ss_flags) | + __put_user(t->sas_ss_size, &uss->ss_size); +} #endif #endif -- cgit v1.2.3 From 2832bc19f6668fd00116f61f821105040599ef8b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Dec 2012 17:42:16 -0800 Subject: sched: numa: ksm: fix oops in task_numa_placment() task_numa_placement() oopsed on NULL p->mm when task_numa_fault() got called in the handling of break_ksm() for ksmd. That might be a peculiar case, which perhaps KSM could takes steps to avoid? but it's more robust if task_numa_placement() allows for such a possibility. 
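The fix itself is a plain guard clause: check for a missing mm before dereferencing it. A generic sketch of the pattern with made-up types, not the scheduler code:

#include <stdio.h>
#include <stddef.h>

struct mm { int numa_scan_seq; };
struct task { const char *comm; struct mm *mm; };

static void numa_placement(struct task *p)
{
        /* Kernel threads such as ksmd have no mm of their own; bail out early. */
        if (!p->mm)
                return;
        printf("%s: seq=%d\n", p->comm, p->mm->numa_scan_seq);
}

int main(void)
{
        struct mm mm = { .numa_scan_seq = 42 };
        struct task user_task = { .comm = "worker", .mm = &mm };
        struct task kthread = { .comm = "ksmd", .mm = NULL };

        numa_placement(&user_task);     /* prints the sequence number */
        numa_placement(&kthread);       /* safely ignored */
        return 0;
}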
Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4603d6cb9e25..5eea8707234a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -793,8 +793,11 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; static void task_numa_placement(struct task_struct *p) { - int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + int seq; + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); if (p->numa_scan_seq == seq) return; p->numa_scan_seq = seq; -- cgit v1.2.3 From 44fd07e989a9a27476d8963296688127a4fd4df4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 20 Dec 2012 15:05:21 -0800 Subject: kcmp: include linux/ptrace.h This makes it compile on s390. After all the ptrace_may_access (which we use this file) is declared exactly in linux/ptrace.h. This is preparatory work to wire this syscall up on all archs. Signed-off-by: Cyrill Gorcunov Signed-off-by: Alexander Kartashov Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 30b7b225306c..e30ac0fe61c3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From cfde819088422503b5c69e03ab7bb90f87121d4d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Dec 2012 15:05:56 -0800 Subject: keys: use keyring_alloc() to create module signing keyring Use keyring_alloc() to create special keyrings now that it has a permissions parameter rather than using key_alloc() + key_instantiate_and_link(). Signed-off-by: David Howells Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/modsign_pubkey.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 045504fffbb2..2b6e69909c39 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -34,18 +34,15 @@ static __init int module_verify_init(void) { pr_notice("Initialise module verification\n"); - modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + modsign_keyring = keyring_alloc(".module_sign", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(modsign_keyring)) panic("Can't allocate module signing keyring\n"); - if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) - panic("Can't instantiate module signing keyring\n"); - return 0; } -- cgit v1.2.3 From 8382fcac1b813ad0a4e68a838fc7ae93fa39eda0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Dec 2012 19:26:06 -0800 Subject: pidns: Outlaw thread creation after unshare(CLONE_NEWPID) The sequence: unshare(CLONE_NEWPID) clone(CLONE_THREAD|CLONE_SIGHAND|CLONE_VM) Creates a new process in the new pid namespace without setting pid_ns->child_reaper. After forking this results in a NULL pointer dereference. 
Avoid this and other nonsense scenarios that can show up after creating a new pid namespace with unshare by adding a new check in copy_process. Pointed-out-by: Oleg Nesterov Acked-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a31b823b3c2d..65ca6d27f24e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1166,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); + /* + * If the new process will be in a different pid namespace + * don't allow the creation of threads. + */ + if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && + (task_active_pid_ns(current) != current->nsproxy->pid_ns)) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; -- cgit v1.2.3 From c876ad7682155958d0c9c27afe9017925c230d64 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Dec 2012 20:27:12 -0800 Subject: pidns: Stop pid allocation when init dies Oleg pointed out that in a pid namespace the sequence: - pid 1 becomes a zombie - setns(thepidns), fork,... - reaping pid 1. - The injected processes exiting. Can lead to processes attempting to access their child reaper and instead following a stale pointer. That waitpid for init can return before all of the processes in the pid namespace have exited is also unfortunate. Avoid these problems by disabling the allocation of new pids in a pid namespace when init dies, instead of when the last process in a pid namespace is reaped. Pointed-out-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- kernel/pid.c | 15 ++++++++++++--- kernel/pid_namespace.c | 4 ++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 36aa02ff17d6..de9af600006f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -270,7 +270,6 @@ void free_pid(struct pid *pid) wake_up_process(ns->child_reaper); break; case 0: - ns->nr_hashed = -1; schedule_work(&ns->proc_work); break; } @@ -319,7 +318,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (ns->nr_hashed < 0) + if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, @@ -342,6 +341,13 @@ out_free: goto out; } +void disable_pid_allocation(struct pid_namespace *ns) +{ + spin_lock_irq(&pidmap_lock); + ns->nr_hashed &= ~PIDNS_HASH_ADDING; + spin_unlock_irq(&pidmap_lock); +} + struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { struct hlist_node *elem; @@ -573,6 +579,9 @@ void __init pidhash_init(void) void __init pidmap_init(void) { + /* Veryify no one has done anything silly */ + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); @@ -584,7 +593,7 @@ void __init pidmap_init(void) /* Reserve PID 0. 
We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.nr_hashed = 1; + init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index fdbd0cdf271a..c1c3dc1c6023 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -115,6 +115,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); @@ -181,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; + /* Don't allow any more processes into the pid namespace */ + disable_pid_allocation(pid_ns); + /* Ignore SIGCHLD causing any terminated children to autoreap */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; -- cgit v1.2.3 From 90228fc110303549aa1d4d86083bf585df8624c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Dec 2012 03:33:38 -0500 Subject: switch compat_sys_sigaltstack() to COMPAT_SYSCALL_DEFINE Makes sigaltstack conversion easier to split into per-architecture parts. Signed-off-by: Al Viro --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7aaa51d8e5b8..00b4a6d4449d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3119,8 +3119,9 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) #ifdef CONFIG_COMPAT #ifdef CONFIG_GENERIC_SIGALTSTACK -asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, - compat_stack_t __user *uoss_ptr) +COMPAT_SYSCALL_DEFINE2(sigaltstack, + const compat_stack_t __user *, uss_ptr, + compat_stack_t __user *, uoss_ptr) { stack_t uss, uoss; int ret; -- cgit v1.2.3 From 8d9807b109497ca41d363dc7b6ff2bb6c0d52524 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Dec 2012 14:56:40 -0500 Subject: switch compat_sys_wait4() and compat_sys_waitid() to COMPAT_SYSCALL_DEFINE Strictly speaking, ppc64 needs it for C ABI compliance. Realistically I would be very surprised if e.g. passing 0xffffffff as 'options' argument to waitid() from 32bit task would cause problems, but yes, it puts us into undefined behaviour territory. ppc64 expects int argument to be passed in 64bit register with bits 31..63 containing the same value. SYSCALL_DEFINE on ppc provides a wrapper that normalizes the value passed from userland; so does COMPAT_SYSCALL_DEFINE. Plain declaration of compat_sys_something() with an int argument obviously doesn't. Again, for wait4 and waitid I would be extremely surprised if gcc started to produce code depending on that value having been properly sign-extended - the argument(s) in question end up passed blindly to sys_wait4 and sys_waitid resp. and normalization for native syscalls takes care of their use there. Still, better to use COMPAT_SYSCALL_DEFINE here than worry about nasal daemons... 
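The ABI point is easiest to see in plain C: whether 0xffffffff coming from a 32-bit caller is treated as 4294967295 or as -1 depends entirely on whether someone sign-extends it, which is what the COMPAT_SYSCALL_DEFINE wrappers guarantee. A userspace illustration of the sign-extension rule only (it says nothing about the ppc64 calling convention itself):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t raw = 0xffffffffu;     /* what a 32-bit task might pass as 'options' */

        /* Zero-extended into a 64-bit register: 4294967295. */
        int64_t unnormalized = (int64_t)(uint64_t)raw;

        /* Sign-extended, as the compat wrapper normalizes it: -1. */
        int64_t normalized = (int64_t)(int32_t)raw;

        printf("zero-extended: %lld\n", (long long)unnormalized);
        printf("sign-extended: %lld\n", (long long)normalized);
        return 0;
}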
Signed-off-by: Al Viro --- kernel/compat.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index f6150e92dfc9..0770ac57c62b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) return 0; } -asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, - struct compat_rusage __user *ru) +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) { if (!ru) { return sys_wait4(pid, stat_addr, options, NULL); @@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, } } -asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, - struct compat_siginfo __user *uinfo, int options, - struct compat_rusage __user *uru) +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, uinfo, int, options, + struct compat_rusage __user *, uru) { siginfo_t info; struct rusage ru; -- cgit v1.2.3 From a566c288826ad4502e43b59570214f18173d7744 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Dec 2012 23:14:49 -0500 Subject: x32: fix waitid() It needs 64bit rusage and 32bit siginfo. glibc never calls it with non-NULL rusage pointer, or we would've seen breakage already... Signed-off-by: Al Viro --- kernel/compat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 0770ac57c62b..e5cc33c7122c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -587,7 +587,11 @@ COMPAT_SYSCALL_DEFINE5(waitid, return ret; if (uru) { - ret = put_compat_rusage(&ru, uru); + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); if (ret) return ret; } -- cgit v1.2.3 From b2ddedcd21f44a5873ee3d6ff6118a2318e01e18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Dec 2012 12:31:00 -0500 Subject: x32: fix sigtimedwait It needs 64bit timespec. As it is, we end up truncating the timeout to whole seconds; usually it doesn't matter, but for having all sub-second timeouts truncated to one jiffy is visibly wrong. Signed-off-by: Al Viro --- kernel/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e5cc33c7122c..36700e9e2be9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1001,7 +1001,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, sigset_from_compat(&s, &s32); if (uts) { - if (get_compat_timespec(&t, uts)) + if (compat_get_timespec(&t, uts)) return -EFAULT; } -- cgit v1.2.3 From 52441fa8f2f1ccc9fa97607c6ccf8b46b9fd15ae Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Jan 2013 11:09:53 +1030 Subject: module: prevent warning when finit_module a 0 sized file If we try to finit_module on a file sized 0 bytes vmalloc will scream and spit out a warning. Since modules have to be bigger than 0 bytes anyways we can just check that beforehand and avoid the warning. 
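A rough userspace check of the zero-byte case; this assumes a kernel new enough to have finit_module() plus the fix above, headers that define SYS_finit_module, and CAP_SYS_MODULE (otherwise expect EPERM rather than EINVAL):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/empty.ko", O_CREAT | O_RDWR | O_TRUNC, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* glibc had no wrapper at the time, so invoke the syscall directly. */
        long ret = syscall(SYS_finit_module, fd, "", 0);
        if (ret < 0)
                printf("finit_module: %s (EINVAL expected on a patched kernel)\n",
                       strerror(errno));

        close(fd);
        unlink("/tmp/empty.ko");
        return 0;
}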
Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- kernel/module.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..41bc1189b061 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2527,6 +2527,13 @@ static int copy_module_from_fd(int fd, struct load_info *info) err = -EFBIG; goto out; } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + err = -EINVAL; + goto out; + } + info->hdr = vmalloc(stat.size); if (!info->hdr) { err = -ENOMEM; -- cgit v1.2.3 From 35dac27cedd14c3b6fcd4ba7bc3c31738cfd1831 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 4 Jan 2013 15:35:50 -0800 Subject: printk: fix incorrect length from print_time() when seconds > 99999 print_prefix() passes a NULL buf to print_time() to get the length of the time prefix; when printk times are enabled, the current code just returns the constant 15, which matches the format "[%5lu.%06lu] " used to print the time value. However, this is obviously incorrect when the whole seconds part of the time gets beyond 5 digits (100000 seconds is a bit more than a day of uptime). The simple fix is to use snprintf(NULL, 0, ...) to calculate the actual length of the time prefix. This could be micro-optimized but it seems better to have simpler, more readable code here. The bug leads to the syslog system call miscomputing which messages fit into the userspace buffer. If there are enough messages to fill log_buf_len and some have a timestamp >= 100000, dmesg may fail with: # dmesg klogctl: Bad address When this happens, strace shows that the failure is indeed EFAULT due to the kernel mistakenly accessing past the end of dmesg's buffer, since dmesg asks the kernel how big a buffer it needs, allocates a bit more, and then gets an error when it asks the kernel to fill it: syslog(0xa, 0, 0) = 1048576 mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa4d25d2000 syslog(0x3, 0x7fa4d25d2010, 0x100008) = -1 EFAULT (Bad address) As far as I can see, the bug has been there as long as print_time(), which comes from commit 084681d14e42 ("printk: flush continuation lines immediately to console") in 3.5-rc5. Signed-off-by: Roland Dreier Signed-off-by: Greg Kroah-Hartman Cc: Joe Perches Cc: Sylvain Munaut Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 19c0d7bcf24a..357f714ddd49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -870,10 +870,11 @@ static size_t print_time(u64 ts, char *buf) if (!printk_time) return 0; + rem_nsec = do_div(ts, 1000000000); + if (!buf) - return 15; + return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); - rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); } -- cgit v1.2.3 From 5ba53ff648e785445a32ba39112ed07e4cf588d0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Jan 2013 19:13:13 +0100 Subject: signals: sys_ssetmask() uses uninitialized newmask Commit 77097ae503b1 ("most of set_current_blocked() callers want SIGKILL/SIGSTOP removed from set") removed the initialization of newmask by accident, causing ltp to complain like this: ssetmask01 1 TFAIL : sgetmask() failed: TEST_ERRNO=???(0): Success Restore the proper initialization. 
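Returning to the print_time() change earlier in this series: it relies on a standard C guarantee that snprintf(NULL, 0, ...) returns how many characters the formatted output would need, so the measured length can never drift from the format string. A quick userspace illustration:

#include <stdio.h>

int main(void)
{
        unsigned long secs = 100000;    /* six digits: the case the constant 15 missed */
        unsigned long usecs = 123456;

        /* Ask snprintf for the length without providing a buffer. */
        int need = snprintf(NULL, 0, "[%5lu.%06lu] ", secs, usecs);

        char buf[64];
        int wrote = snprintf(buf, sizeof(buf), "[%5lu.%06lu] ", secs, usecs);

        printf("predicted %d, wrote %d: \"%s\"\n", need, wrote, buf);
        return 0;
}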
Reported-and-tested-by: CAI Qian Signed-off-by: Oleg Nesterov Cc: stable@kernel.org # v3.5+ Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7aaa51d8e5b8..9692499c77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3286,6 +3286,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; + siginitset(&newset, newmask); set_current_blocked(&newset); return old; -- cgit v1.2.3 From 0c4a842349b27d361f1503f6437df303e1b541c9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Jan 2013 19:13:29 +0100 Subject: signals: set_current_blocked() can use __set_current_blocked() Cleanup. And I think we need more cleanups, in particular __set_current_blocked() and sigprocmask() should die. Nobody should ever block SIGKILL or SIGSTOP. - Change set_current_blocked() to use __set_current_blocked() - Change sys_sigprocmask() to use set_current_blocked(), this way it should not worry about SIGKILL/SIGSTOP. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9692499c77cf..372771e948c2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2528,11 +2528,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) */ void set_current_blocked(sigset_t *newset) { - struct task_struct *tsk = current; sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); - spin_lock_irq(&tsk->sighand->siglock); - __set_task_blocked(tsk, newset); - spin_unlock_irq(&tsk->sighand->siglock); + __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) @@ -3204,7 +3201,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; - new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); new_blocked = current->blocked; @@ -3222,7 +3218,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - __set_current_blocked(&new_blocked); + set_current_blocked(&new_blocked); } if (oset) { -- cgit v1.2.3 From a8dd2176a8e988e3744e863ac39647a6f59fa900 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jan 2013 20:54:17 -0500 Subject: tracing: Fix regression of trace_options file setting The latest change to allow trace options to be set on the command line also broke the trace_options file. The zeroing of the last byte of the option name that is echoed into the trace_option file was removed with the consolidation of some of the code. The compare between the option and what was written to the trace_options file fails because the string holding the data written doesn't terminate with a null character. A zero needs to be added to the end of the string copied from user space. 
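The trace_options regression above comes down to comparing against a copied-in buffer that was never NUL-terminated. A userspace sketch of why the added buf[cnt] = 0 matters (the option name here is only an example):

#include <stdio.h>
#include <string.h>

int main(void)
{
        /* Pretend this arrived via copy_from_user(): cnt bytes, no terminator. */
        const char user_data[] = { 'i', 'r', 'q', '-', 'i', 'n', 'f', 'o' };
        size_t cnt = sizeof(user_data);

        char buf[64];
        memcpy(buf, user_data, cnt);
        buf[cnt] = 0;                   /* without this, strcmp() reads past the copy */

        if (strcmp(buf, "irq-info") == 0)
                puts("option matched");
        return 0;
}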
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5125677efa0..1bbfa0446507 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2899,6 +2899,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = 0; + trace_set_options(buf); *ppos += cnt; -- cgit v1.2.3 From bfbbd96c51b441b7a9a08762aa9ab832f6655b2c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 9 Jan 2013 17:12:45 -0800 Subject: audit: fix auditfilter.c kernel-doc warnings Fix new kernel-doc warning in auditfilter.c: Warning(kernel/auditfilter.c:1157): Excess function parameter 'uid' description in 'audit_receive_filter' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Cc: linux-audit@redhat.com (subscribers-only) Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7f19f23d38a3..f9fc54bbe06f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, * audit_receive_filter - apply all rules to the specified message type * @type: audit message type * @pid: target pid for netlink audit messages - * @uid: target uid for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data -- cgit v1.2.3 From 2df8f8a6a897ebf4c5613b5be6103d33b2a21520 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Jan 2013 16:14:10 -0500 Subject: tracing: Fix regression with irqsoff tracer and tracing_on file Commit 02404baf1b47 "tracing: Remove deprecated tracing_enabled file" removed the tracing_enabled file as it never worked properly and the tracing_on file should be used instead. But the tracing_on file didn't call into the tracers start/stop routines like the tracing_enabled file did. This caused trace-cmd to break when it enabled the irqsoff tracer. If you just did "echo irqsoff > current_tracer" then it would work properly. But the tool trace-cmd disables tracing first by writing "0" into the tracing_on file. Then it writes "irqsoff" into current_tracer and then writes "1" into tracing_on. Unfortunately, the above commit changed the irqsoff tracer to check the tracing_on status instead of the tracing_enabled status. If it's disabled then it does not start the tracer internals. The problem is that writing "1" into tracing_on does not call the tracers "start" routine like writing "1" into tracing_enabled did. This makes the irqsoff tracer not start when using the trace-cmd tool, and is a regression for userspace. Simple fix is to have the tracing_on file call the tracers start() method when being enabled (and the stop() method when disabled). 
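The failing sequence can be reproduced by driving the tracing files in the same order trace-cmd does. A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug, the irqsoff tracer is built in, and the program runs as root:

#include <stdio.h>

/* Write a short string to a tracing control file. */
static int tput(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        const char *base = "/sys/kernel/debug/tracing";
        char p[256];

        snprintf(p, sizeof(p), "%s/tracing_on", base);
        tput(p, "0");                   /* disable first, as trace-cmd does */

        snprintf(p, sizeof(p), "%s/current_tracer", base);
        tput(p, "irqsoff");             /* select the latency tracer */

        snprintf(p, sizeof(p), "%s/tracing_on", base);
        tput(p, "1");                   /* should also call the tracer's start() */

        return 0;
}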
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1bbfa0446507..f3ec1cfb0de1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4817,10 +4817,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - if (val) + mutex_lock(&trace_types_lock); + if (val) { ring_buffer_record_on(buffer); - else + if (current_trace->start) + current_trace->start(tr); + } else { ring_buffer_record_off(buffer); + if (current_trace->stop) + current_trace->stop(tr); + } + mutex_unlock(&trace_types_lock); } (*ppos)++; -- cgit v1.2.3 From 1b963c81b14509e330e0fe3218b645ece2738dc5 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 11 Jan 2013 14:31:56 -0800 Subject: lockdep, rwsem: provide down_write_nest_lock() down_write_nest_lock() provides a means to annotate a locking scenario where an outer lock is guaranteed to serialize the order in which nested locks are acquired. This is analogous to the already existing mutex_lock_nest_lock() and spin_lock_nest_lock(). Signed-off-by: Jiri Kosina Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rwsem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 6850f53e02d8..b3c6c3fcd847 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); -- cgit v1.2.3 From 7b9205bd775afc4439ed86d617f9042ee9e76a71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Jan 2013 14:32:05 -0800 Subject: audit: create explicit AUDIT_SECCOMP event type The seccomp path was using AUDIT_ANOM_ABEND from when seccomp mode 1 could only kill a process. While we still want to make sure an audit record is forced on a kill, this should use a separate record type since seccomp mode 2 introduces other behaviors. In the case of "handled" behaviors (process wasn't killed), only emit a record if the process is under inspection. This change also fixes userspace examination of seccomp audit events, since it was considered malformed due to missing fields of the AUDIT_ANOM_ABEND event type. Signed-off-by: Kees Cook Cc: Al Viro Cc: Eric Paris Cc: Jeff Layton Cc: "Eric W. 
Biederman" Cc: Julien Tinnes Acked-by: Will Drewry Acked-by: Steve Grubb Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e37e6a12c5e3..3e46d1dec613 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; @@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); +} + +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + audit_log_task(ab); audit_log_format(ab, " reason="); audit_log_string(ab, reason); audit_log_format(ab, " sig=%ld", signr); @@ -2723,8 +2728,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", signr); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + if (unlikely(!ab)) + return; + audit_log_task(ab); + audit_log_format(ab, " sig=%ld", signr); audit_log_format(ab, " syscall=%ld", syscall); audit_log_format(ab, " compat=%d", is_compat_task()); audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); -- cgit v1.2.3 From 0644ec0cc8a33fb654e348897ad7684e22a4b5d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Jan 2013 14:32:07 -0800 Subject: audit: catch possible NULL audit buffers It's possible for audit_log_start() to return NULL. Handle it in the various callers. Signed-off-by: Kees Cook Cc: Al Viro Cc: Eric Paris Cc: Jeff Layton Cc: "Eric W. 
Biederman" Cc: Julien Tinnes Cc: Will Drewry Cc: Steve Grubb Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 4 ++++ kernel/audit_tree.c | 26 +++++++++++++++++--------- kernel/audit_watch.c | 2 ++ kernel/auditsc.c | 6 ++++-- 4 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 40414e9143db..a219998aecc1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old, int rc = 0; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return rc; audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, old, from_kuid(&init_user_ns, loginuid), sessionid); if (sid) { @@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + if (unlikely(!*ab)) + return rc; audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", task_tgid_vnr(current), from_kuid(&init_user_ns, current_uid()), diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e81175ef25f8..642a89c4f3d6 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -449,11 +449,26 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } +static void audit_log_remove_rule(struct audit_krule *rule) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); + audit_log_untrustedstring(ab, rule->tree->pathname); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; - struct audit_buffer *ab; list_for_each_entry_safe(rule, next, &tree->rules, rlist) { entry = container_of(rule, struct audit_entry, rule); @@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); - audit_log_format(ab, " dir="); - audit_log_untrustedstring(ab, rule->tree->pathname); - audit_log_key(ab, rule->filterkey); - audit_log_format(ab, " list=%d res=1", rule->listnr); - audit_log_end(ab); + audit_log_remove_rule(rule); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 4a599f699adc..22831c4d369c 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (audit_enabled) { struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; audit_log_format(ab, "auid=%u ses=%u op=", from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3e46d1dec613..a371f857a0a9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); + if (unlikely(!ab)) + return; audit_log_format(ab, 
"qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, context->ipc.perm_mode); - if (!ab) - return; } break; } case AUDIT_MQ_OPEN: { @@ -2720,6 +2720,8 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + if (unlikely(!ab)) + return; audit_log_abend(ab, "memory violation", signr); audit_log_end(ab); } -- cgit v1.2.3 From 829199197a430dade2519d54f5545c4a094393b8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 11 Jan 2013 14:32:11 -0800 Subject: kernel/audit.c: avoid negative sleep durations audit_log_start() performs the same jiffies comparison in two places. If sufficient time has elapsed between the two comparisons, the second one produces a negative sleep duration: schedule_timeout: wrong timeout value fffffffffffffff0 Pid: 6606, comm: trinity-child1 Not tainted 3.8.0-rc1+ #43 Call Trace: schedule_timeout+0x305/0x340 audit_log_start+0x311/0x470 audit_log_exit+0x4b/0xfb0 __audit_syscall_exit+0x25f/0x2c0 sysret_audit+0x17/0x21 Fix it by performing the comparison a single time. Reported-by: Dave Jones Cc: Al Viro Cc: Eric Paris Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a219998aecc1..d596e5355f15 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } +/* + * Wait for auditd to drain the queue a little + */ +static void wait_for_auditd(unsigned long sleep_time) +{ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&audit_backlog_wait, &wait); + + if (audit_backlog_limit && + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + schedule_timeout(sleep_time); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&audit_backlog_wait, &wait); +} + /* Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to * audit_log_*format. If the tsk is a task that is currently in a @@ -1146,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time - && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { - - /* Wait for auditd to drain the queue a little */ - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&audit_backlog_wait, &wait); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) - schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); + if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + unsigned long sleep_time; - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); + sleep_time = timeout_start + audit_backlog_wait_time - + jiffies; + if ((long)sleep_time > 0) + wait_for_auditd(sleep_time); continue; } if (audit_rate_check() && printk_ratelimit()) -- cgit v1.2.3 From 0d21b0e3477395e7ff2acc269f15df6e6a8d356d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 12 Jan 2013 11:38:44 +1030 Subject: module: add new state MODULE_STATE_UNFORMED. You should never look at such a module, so it's excised from all paths which traverse the modules list. 
We add the state at the end, to avoid gratuitous ABI break (ksplice). Signed-off-by: Rusty Russell --- kernel/debug/kdb/kdb_main.c | 2 ++ kernel/module.c | 57 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d5f8d5612f3..8875254120b6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("Module Size modstruct Used by\n"); list_for_each_entry(mod, kdb_modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); diff --git a/kernel/module.c b/kernel/module.c index 41bc1189b061..c3a2ee8e3679 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -188,6 +188,7 @@ struct load_info { ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) { + BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); if (mod && mod->state == MODULE_STATE_COMING) return -EBUSY; if (try_module_get(mod)) @@ -343,6 +344,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, #endif }; + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) return true; } @@ -450,16 +454,24 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -struct module *find_module(const char *name) +static struct module *find_module_all(const char *name, + bool even_unformed) { struct module *mod; list_for_each_entry(mod, &modules, list) { + if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) + continue; if (strcmp(mod->name, name) == 0) return mod; } return NULL; } + +struct module *find_module(const char *name) +{ + return find_module_all(name, false); +} EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -525,6 +537,8 @@ bool is_module_percpu_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (!mod->percpu_size) continue; for_each_possible_cpu(cpu) { @@ -1048,6 +1062,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, case MODULE_STATE_GOING: state = "going"; break; + default: + BUG(); } return sprintf(buffer, "%s\n", state); } @@ -1786,6 +1802,8 @@ void set_all_modules_text_rw(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -1807,6 +1825,8 @@ void set_all_modules_text_ro(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -2998,7 +3018,8 @@ static bool finished_loading(const char *name) mutex_lock(&module_mutex); mod = find_module(name); - ret = !mod || mod->state != MODULE_STATE_COMING; + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; @@ -3361,6 +3382,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if 
(mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { if (modname) @@ -3384,6 +3407,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3408,6 +3433,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3435,6 +3462,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -3477,9 +3506,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((ret = mod_find_symname(mod, name)) != 0) break; + } } preempt_enable(); return ret; @@ -3494,6 +3526,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, int ret; list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; for (i = 0; i < mod->num_symtab; i++) { ret = fn(data, mod->strtab + mod->symtab[i].st_name, mod, mod->symtab[i].st_value); @@ -3509,6 +3543,7 @@ static char *module_flags(struct module *mod, char *buf) { int bx = 0; + BUG_ON(mod->state == MODULE_STATE_UNFORMED); if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { @@ -3550,6 +3585,10 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; + /* We always ignore unformed modules. 
*/ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -3610,6 +3649,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (mod->num_exentries == 0) continue; @@ -3658,10 +3699,13 @@ struct module *__module_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_core(addr, mod) || within_module_init(addr, mod)) return mod; + } return NULL; } EXPORT_SYMBOL_GPL(__module_address); @@ -3714,8 +3758,11 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; printk(" %s%s", mod->name, module_flags(mod, buf)); + } preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); -- cgit v1.2.3 From 1fb9341ac34825aa40354e74d9a2c69df7d2c304 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 12 Jan 2013 13:27:34 +1030 Subject: module: put modules in list much earlier. Prarit's excellent bug report: > In recent Fedora releases (F17 & F18) some users have reported seeing > messages similar to > > [ 15.478160] kvm: Could not allocate 304 bytes percpu data > [ 15.478174] PERCPU: allocation failed, size=304 align=32, alloc from > reserved chunk failed > > during system boot. In some cases, users have also reported seeing this > message along with a failed load of other modules. > > What is happening is systemd is loading an instance of the kvm module for > each cpu found (see commit e9bda3b). When the module load occurs the kernel > currently allocates the modules percpu data area prior to checking to see > if the module is already loaded or is in the process of being loaded. If > the module is already loaded, or finishes load, the module loading code > releases the current instance's module's percpu data. Now we have a new state MODULE_STATE_UNFORMED, we can insert the module into the list (and thus guarantee its uniqueness) before we allocate the per-cpu region. Reported-by: Prarit Bhargava Signed-off-by: Rusty Russell Tested-by: Prarit Bhargava --- kernel/module.c | 90 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c3a2ee8e3679..ec535aa63c98 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3017,7 +3017,7 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module(name); + mod = find_module_all(name, true); ret = !mod || mod->state == MODULE_STATE_LIVE || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); @@ -3141,6 +3141,32 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + /* + * We try to place it in the list now to make sure it's unique + * before we dedicate too many resources. In particular, + * temporary percpu memory exhaustion. 
+ */ + mod->state = MODULE_STATE_UNFORMED; +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto free_module; + goto again; + } + err = -EEXIST; + mutex_unlock(&module_mutex); + goto free_module; + } + list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); + #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; if (!mod->sig_ok) @@ -3150,7 +3176,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) - goto free_module; + goto unlink_mod; /* Now we've got everything in the final locations, we can * find optional sections. */ @@ -3185,54 +3211,33 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_arch_cleanup; } - /* Mark state as coming so strong_try_module_get() ignores us. */ - mod->state = MODULE_STATE_COMING; - - /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. No one should access us, since - * strong_try_module_get() will fail. - * lockdep/oops can run asynchronous, so use the RCU list insertion - * function to insert in a way safe to concurrent readers. - * The mutex protects against concurrent writers. - */ -again: - mutex_lock(&module_mutex); - if ((old = find_module(mod->name)) != NULL) { - if (old->state == MODULE_STATE_COMING) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_arch_cleanup; - goto again; - } - err = -EEXIST; - goto unlock; - } - - /* This has to be done once we're sure module name is unique. */ dynamic_debug_setup(info->debug, info->num_debug); - /* Find duplicate symbols */ + mutex_lock(&module_mutex); + /* Find duplicate symbols (must be called under lock). */ err = verify_export_symbols(mod); if (err < 0) - goto ddebug; + goto ddebug_cleanup; + /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - list_add_rcu(&mod->list, &modules); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); if (err < 0) - goto unlink; + goto bug_cleanup; /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto unlink; + goto bug_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3242,16 +3247,13 @@ again: return do_init_module(mod); - unlink: + bug_cleanup: + /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. 
*/ - list_del_rcu(&mod->list); module_bug_cleanup(mod); - wake_up_all(&module_wq); - ddebug: - dynamic_debug_remove(info->debug); - unlock: mutex_unlock(&module_mutex); + ddebug_cleanup: + dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); free_arch_cleanup: @@ -3260,6 +3262,12 @@ again: free_modinfo(mod); free_unload: module_unload_free(mod); + unlink_mod: + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + wake_up_all(&module_wq); + mutex_unlock(&module_mutex); free_module: module_deallocate(mod, info); free_copy: -- cgit v1.2.3 From 250bfd3d8e7e19cb649dd94689f0af2ce3474060 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Jan 2013 10:54:11 +0800 Subject: tracing: Fix regression of trace_pipe Commit 0fb9656d "tracing: Make tracing_enabled be equal to tracing_on" changes the behaviour of trace_pipe, ie. it makes trace_pipe return if we've read something and tracing is enabled, and this means that we have to 'cat trace_pipe' again and again while running tests. IMO the right way is if tracing is enabled, we always block and wait for ring buffer, or we may lose what we want since ring buffer's size is limited. Link: http://lkml.kernel.org/r/1358132051-5410-1-git-send-email-bo.li.liu@oracle.com Signed-off-by: Liu Bo Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3ec1cfb0de1..3c13e46d7d24 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3454,7 +3454,7 @@ static int tracing_wait_pipe(struct file *filp) return -EINTR; /* - * We block until we read something and tracing is enabled. + * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, @@ -3462,7 +3462,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (tracing_is_enabled() && iter->pos) + if (!tracing_is_enabled() && iter->pos) break; } -- cgit v1.2.3 From 774a1221e862b343388347bac9b318767336b20b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Jan 2013 18:52:51 -0800 Subject: module, async: async_synchronize_full() on module init iff async is used If the default iosched is built as module, the kernel may deadlock while trying to load the iosched module on device probe if the probing was running off async. This is because async_synchronize_full() at the end of module init ends up waiting for the async job which initiated the module loading. async A modprobe 1. finds a device 2. registers the block device 3. request_module(default iosched) 4. modprobe in userland 5. load and init module 6. async_synchronize_full() Async A waits for modprobe to finish in request_module() and modprobe waits for async A to finish in async_synchronize_full(). Because there's no easy to track dependency once control goes out to userland, implementing properly nested flushing is difficult. For now, make module init perform async_synchronize_full() iff module init has queued async jobs as suggested by Linus. This avoids the described deadlock because iosched module doesn't use async and thus wouldn't invoke async_synchronize_full(). This is hacky and incomplete. 
It will deadlock if async module loading nests; however, this works around the known problem case and seems to be the best of bad options. For more details, please refer to the following thread. http://thread.gmane.org/gmane.linux.kernel/1420814 Signed-off-by: Tejun Heo Reported-by: Alex Riesen Tested-by: Ming Lei Tested-by: Alex Riesen Cc: Arjan van de Ven Cc: Jens Axboe Signed-off-by: Linus Torvalds --- kernel/async.c | 3 +++ kernel/module.c | 27 +++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 9d3118384858..a1d585c351d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -196,6 +196,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); + /* mark that this task has queued an async job, used by module init */ + current->flags |= PF_USED_ASYNC; + /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..b10b048367e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3013,6 +3013,12 @@ static int do_init_module(struct module *mod) { int ret = 0; + /* + * We want to find out whether @mod uses async during init. Clear + * PF_USED_ASYNC. async_schedule*() will set it. + */ + current->flags &= ~PF_USED_ASYNC; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -3058,8 +3064,25 @@ static int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly + * detected block device can trigger request_module() of the + * default iosched from async probing task. Once userland helper + * reaches here, async_synchronize_full() will wait on the async + * task waiting on request_module() and deadlock. + * + * This deadlock is avoided by perfomring async_synchronize_full() + * iff module init queued any async jobs. This isn't a full + * solution as it will deadlock the same if module loading from + * async jobs nests more than once; however, due to the various + * constraints, this hack seems to be the best option for now. + * Please refer to the following thread for details. + * + * http://thread.gmane.org/gmane.linux.kernel/1420814 + */ + if (current->flags & PF_USED_ASYNC) + async_synchronize_full(); mutex_lock(&module_mutex); /* Drop initial reference. 
*/ -- cgit v1.2.3 From b1e0318b8cd4bdbb0fbc48967b0350483ad9bd69 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jan 2013 22:13:34 -0500 Subject: sys_clone() needs asmlinkage_protect Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- kernel/fork.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a31b823b3c2d..e05cff2429b5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1660,8 +1660,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, 0, - parent_tidptr, child_tidptr); + long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + asmlinkage_protect(5, ret, clone_flags, newsp, + parent_tidptr, child_tidptr, tls_val); + return ret; } #endif -- cgit v1.2.3 From edea0d03ee5f0ae0051b6adb6681ebdf976b1ca4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 20 Jan 2013 20:25:47 +0100 Subject: ia64: kill thread_matches(), unexport ptrace_check_attach() The ia64 function "thread_matches()" has no users since commit e868a55c2a8c ("[IA64] remove find_thread_for_addr()"). Remove it. This allows us to make ptrace_check_attach() static to kernel/ptrace.c, which is good since we'll need to change the semantics of it and fix up all the callers. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1599157336a6..612a56126851 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -139,7 +139,7 @@ void __ptrace_unlink(struct task_struct *child) * RETURNS: * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, bool ignore_state) +static int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; -- cgit v1.2.3 From ee61abb3223e28a1a14a8429c0319755d20d3e40 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Jan 2013 20:22:58 -0800 Subject: module: fix missing module_mutex unlock Commit 1fb9341ac348 ("module: put modules in list much earlier") moved some of the module initialization code around, and in the process changed the exit paths too. But for the duplicate export symbol error case the change made the ddebug_cleanup path jump to after the module mutex unlock, even though it happens with the mutex held. Rusty has some patches to split this function up into some helper functions, hopefully the mess of complex goto targets will go away eventually. 
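For readers who don't have load_module() in front of them, the pattern at stake is the usual goto-based error unwinding: any path that jumps to a cleanup label while holding a lock must find the unlock at or after that label. A minimal standalone sketch of that pattern (ordinary userspace C with made-up setup_a()/setup_b() helpers and a pthread mutex standing in for module_mutex; this is not the kernel code itself):

/*
 * Standalone sketch, not load_module(): setup_a()/setup_b() and the lock
 * are made up. The point is that the unlock must sit after the cleanup
 * label, because the setup_b() failure path jumps there with the lock
 * still held; an unlock placed before the label would leave the lock
 * held on that path, which is the kind of leak fixed above.
 */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int setup_a(void) { return 0; }
static int setup_b(void) { return -1; }		/* force the error path */
static void teardown_a(void) { puts("teardown_a"); }

static int do_setup(void)
{
	int err;

	err = setup_a();
	if (err)
		return err;

	pthread_mutex_lock(&lock);
	err = setup_b();
	if (err)
		goto cleanup_a;			/* jumps with the lock held */
	pthread_mutex_unlock(&lock);
	return 0;

cleanup_a:
	pthread_mutex_unlock(&lock);		/* after the label: this path unlocks too */
	teardown_a();
	return err;
}

int main(void)
{
	printf("do_setup() = %d\n", do_setup());
	return 0;
}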
Reported-by: Dan Carpenter Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d25e359279ae..eab08274ec9b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3274,8 +3274,8 @@ again: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); - mutex_unlock(&module_mutex); ddebug_cleanup: + mutex_unlock(&module_mutex); dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); -- cgit v1.2.3 From c1bf08ac26e92122faab9f6c32ea8aba94612dae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Dec 2012 09:48:15 -0500 Subject: ftrace: Be first to run code modification on modules If some other kernel subsystem has a module notifier, and adds a kprobe to a ftrace mcount point (now that kprobes work on ftrace points), when the ftrace notifier runs it will fail and disable ftrace, as well as kprobes that are attached to ftrace points. Here's the error: WARNING: at kernel/trace/ftrace.c:1618 ftrace_bug+0x239/0x280() Hardware name: Bochs Modules linked in: fat(+) stap_56d28a51b3fe546293ca0700b10bcb29__8059(F) nfsv4 auth_rpcgss nfs dns_resolver fscache xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack lockd sunrpc ppdev parport_pc parport microcode virtio_net i2c_piix4 drm_kms_helper ttm drm i2c_core [last unloaded: bid_shared] Pid: 8068, comm: modprobe Tainted: GF 3.7.0-0.rc8.git0.1.fc19.x86_64 #1 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] ? __probe_kernel_read+0x46/0x70 [] ? 0xffffffffa017ffff [] ? 0xffffffffa017ffff [] warn_slowpath_null+0x1a/0x20 [] ftrace_bug+0x239/0x280 [] ftrace_process_locs+0x376/0x520 [] ftrace_module_notify+0x47/0x50 [] notifier_call_chain+0x4d/0x70 [] __blocking_notifier_call_chain+0x58/0x80 [] blocking_notifier_call_chain+0x16/0x20 [] sys_init_module+0x73/0x220 [] system_call_fastpath+0x16/0x1b ---[ end trace 9ef46351e53bbf80 ]--- ftrace failed to modify [] init_once+0x0/0x20 [fat] actual: cc:bb:d2:4b:e1 A kprobe was added to the init_once() function in the fat module on load. But this happened before ftrace could have touched the code. As ftrace didn't run yet, the kprobe system had no idea it was a ftrace point and simply added a breakpoint to the code (0xcc in the cc:bb:d2:4b:e1). Then when ftrace went to modify the location from a call to mcount/fentry into a nop, it didn't see a call op, but instead it saw the breakpoint op and not knowing what to do with it, ftrace shut itself down. The solution is to simply give the ftrace module notifier the max priority. This should have been done regardless, as the core code ftrace modification also happens very early on in boot up. This makes the module modification closer to core modification. Link: http://lkml.kernel.org/r/20130107140333.593683061@goodmis.org Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Reported-by: Frank Ch. 
Eigler Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ffe4c5ad3f3..41473b4ad7a4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3998,7 +3998,7 @@ static int ftrace_module_notify(struct notifier_block *self, struct notifier_block ftrace_module_nb = { .notifier_call = ftrace_module_notify, - .priority = 0, + .priority = INT_MAX, /* Run before anything that can use kprobes */ }; extern unsigned long __start_mcount_loc[]; -- cgit v1.2.3 From 910ffdb18a6408e14febbb6e4b6840fd2c928c82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:47:41 +0100 Subject: ptrace: introduce signal_wake_up_state() and ptrace_signal_wake_up() Cleanup and preparation for the next change. signal_wake_up(resume => true) is overused. None of ptrace/jctl callers actually want to wakeup a TASK_WAKEKILL task, but they can't specify the necessary mask. Turn signal_wake_up() into signal_wake_up_state(state), reintroduce signal_wake_up() as a trivial helper, and add ptrace_signal_wake_up() which adds __TASK_TRACED. This way ptrace_signal_wake_up() can work "inside" ptrace_request() even if the tracee doesn't have the TASK_WAKEKILL bit set. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 8 ++++---- kernel/signal.c | 14 ++++---------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 612a56126851..62f7c2774b16 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) * TASK_KILLABLE sleeps. */ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); + ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } @@ -317,7 +317,7 @@ static int ptrace_attach(struct task_struct *task, long request, */ if (task_is_stopped(task) && task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) - signal_wake_up(task, 1); + signal_wake_up_state(task, __TASK_STOPPED); spin_unlock(&task->sighand->siglock); @@ -737,7 +737,7 @@ int ptrace_request(struct task_struct *child, long request, * tracee into STOP. */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); unlock_task_sighand(child, &flags); ret = 0; @@ -763,7 +763,7 @@ int ptrace_request(struct task_struct *child, long request, * start of this trap and now. Trigger re-trap. */ if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); + ptrace_signal_wake_up(child, true); ret = 0; } unlock_task_sighand(child, &flags); diff --git a/kernel/signal.c b/kernel/signal.c index 53cd5c4d1172..6e97aa6fa32c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * No need to set need_resched since signal event passing * goes through ->blocked */ -void signal_wake_up(struct task_struct *t, int resume) +void signal_wake_up_state(struct task_struct *t, unsigned int state) { - unsigned int mask; - set_tsk_thread_flag(t, TIF_SIGPENDING); - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable + * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. 
We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) + if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } @@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t) assert_spin_locked(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); - signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* -- cgit v1.2.3 From 9899d11f654474d2d54ea52ceaa2a1f4db3abd68 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:48:00 +0100 Subject: ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL putreg() assumes that the tracee is not running and pt_regs_access() can safely play with its stack. However, a killed tracee can return from ptrace_stop() to the low-level asm code and do RESTORE_REST, which means that the debugger can actually read/modify the kernel stack until the tracee does SAVE_REST again. set_task_blockstep() can race with SIGKILL too, and in some sense this race is even worse: the very fact that the tracee can be woken up breaks the logic. As Linus suggested, we can clear TASK_WAKEKILL around the arch_ptrace() call; this ensures that nobody can ever wake up the tracee while the debugger looks at it. Not only does this fix the mentioned problems, it also allows some cleanups/simplifications in the arch_ptrace() paths. ptrace_unfreeze_traced() probably needs more callers; for example, it makes sense to make the tracee killable for the oom-killer before access_process_vm(). While at it, add a comment to may_ptrace_stop() explaining why ptrace_stop() still can't rely on SIGKILL and signal_pending_state().
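The heart of the trick is that try_to_wake_up() only wakes a task whose ->state intersects the caller's wake mask, so parking the tracee in __TASK_TRACED (the TASK_WAKEKILL bit cleared) makes even SIGKILL's wakeup a no-op until the state is restored. A toy, userspace-only illustration of that mask test (the constants are hand-copied, illustrative values; this is not kernel code):

/*
 * Toy model of the state-mask test in try_to_wake_up() -- not kernel code.
 * Only the bitwise test matters; the constants are illustrative.
 */
#include <stdio.h>

#define __TASK_TRACED	0x0008
#define TASK_WAKEKILL	0x0080
#define TASK_TRACED	(TASK_WAKEKILL | __TASK_TRACED)

/* try_to_wake_up() bails out unless the task's state intersects the mask. */
static int would_wake(unsigned int task_state, unsigned int wake_mask)
{
	return (task_state & wake_mask) != 0;
}

int main(void)
{
	/* Ordinary traced stop: a TASK_WAKEKILL wakeup (SIGKILL) gets through. */
	printf("TASK_TRACED   vs TASK_WAKEKILL -> %d\n",
	       would_wake(TASK_TRACED, TASK_WAKEKILL));		/* prints 1 */

	/* Frozen by ptrace_freeze_traced(): the same wakeup does nothing. */
	printf("__TASK_TRACED vs TASK_WAKEKILL -> %d\n",
	       would_wake(__TASK_TRACED, TASK_WAKEKILL));	/* prints 0 */
	return 0;
}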
Reported-by: Salman Qazi Reported-by: Suleiman Souhlal Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++--------- kernel/signal.c | 5 +++++ 2 files changed, 59 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 62f7c2774b16..6cbeaae4406d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -122,6 +122,40 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ + bool ret = false; + + /* Lockless, nobody but us can set this flag */ + if (task->jobctl & JOBCTL_LISTENING) + return ret; + + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !__fatal_signal_pending(task)) { + task->state = __TASK_TRACED; + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ + if (task->state != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); + + spin_lock_irq(&task->sighand->siglock); + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + spin_unlock_irq(&task->sighand->siglock); +} + /** * ptrace_check_attach - check whether ptracee is ready for ptrace operation * @child: ptracee to check for @@ -151,24 +185,29 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { + if (child->ptrace && child->parent == current) { + WARN_ON(child->state == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (ignore_state || (task_is_traced(child) && - !(child->jobctl & JOBCTL_LISTENING))) + if (ignore_state || ptrace_freeze_traced(child)) ret = 0; - spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + if (!ret && !ignore_state) { + if (!wait_task_inactive(child, __TASK_TRACED)) { + /* + * This can only happen if may_ptrace_stop() fails and + * ptrace_stop() changes ->state back to TASK_RUNNING, + * so we should not worry about leaking __TASK_TRACED. + */ + WARN_ON(child->state == __TASK_TRACED); + ret = -ESRCH; + } + } - /* All systems go.. 
*/ return ret; } @@ -900,6 +939,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); @@ -1039,8 +1080,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); - if (!ret) + if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); + } out_put_task_struct: put_task_struct(child); diff --git a/kernel/signal.c b/kernel/signal.c index 6e97aa6fa32c..3d09cf6cde75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1794,6 +1794,10 @@ static inline int may_ptrace_stop(void) * If SIGKILL was already sent before the caller unlocked * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). + * + * This is almost outdated, a task with the pending SIGKILL can't + * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported + * after SIGKILL was already dequeued. */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) @@ -1919,6 +1923,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done) do_notify_parent_cldstop(current, false, why); + /* tasklist protects us from ptrace_freeze_traced() */ __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; -- cgit v1.2.3 From 9067ac85d533651b98c2ff903182a20cbb361fcb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:48:17 +0100 Subject: wake_up_process() should be never used to wakeup a TASK_STOPPED/TRACED task wake_up_process() should never wakeup a TASK_STOPPED/TRACED task. Change it to use TASK_NORMAL and add the WARN_ON(). TASK_ALL has no other users, probably can be killed. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..26058d0bebba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1523,7 +1523,8 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -- cgit v1.2.3 From f56c3196f251012de9b3ebaff55732a9074fdaae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 22 Jan 2013 16:15:15 -0800 Subject: async: fix __lowest_in_progress() Commit 083b804c4d3e ("async: use workqueue for worker pool") made it possible that async jobs are moved from pending to running out-of-order. While pending async jobs will be queued and dispatched for execution in the same order, nothing guarantees they'll enter "1) move self to the running queue" of async_run_entry_fn() in the same order. Before the conversion, async implemented its own worker pool. An async worker, upon being woken up, fetches the first item from the pending list, which kept the executing lists sorted. The conversion to workqueue was done by adding work_struct to each async_entry and async just schedules the work item. 
The queueing and dispatching of such work items are still in order but now each worker thread is associated with a specific async_entry and moves that specific async_entry to the executing list. So, depending on which worker reaches that point earlier, which is non-deterministic, we may end up moving an async_entry with larger cookie before one with smaller one. This broke __lowest_in_progress(). running->domain may not be properly sorted and is not guaranteed to contain lower cookies than pending list when not empty. Fix it by ensuring sort-inserting to the running list and always looking at both pending and running when trying to determine the lowest cookie. Over time, the async synchronization implementation became quite messy. We better restructure it such that each async_entry is linked to two lists - one global and one per domain - and not move it when execution starts. There's no reason to distinguish pending and running. They behave the same for synchronization purposes. Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/async.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index a1d585c351d6..6f34904a0b53 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -86,18 +86,27 @@ static atomic_t entry_count; */ static async_cookie_t __lowest_in_progress(struct async_domain *running) { + async_cookie_t first_running = next_cookie; /* infinity value */ + async_cookie_t first_pending = next_cookie; /* ditto */ struct async_entry *entry; + /* + * Both running and pending lists are sorted but not disjoint. + * Take the first cookies from both and return the min. + */ if (!list_empty(&running->domain)) { entry = list_first_entry(&running->domain, typeof(*entry), list); - return entry->cookie; + first_running = entry->cookie; } - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) - return entry->cookie; + list_for_each_entry(entry, &async_pending, list) { + if (entry->running == running) { + first_pending = entry->cookie; + break; + } + } - return next_cookie; /* "infinity" value */ + return min(first_running, first_pending); } static async_cookie_t lowest_in_progress(struct async_domain *running) @@ -118,13 +127,17 @@ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); + struct async_entry *pos; unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; struct async_domain *running = entry->running; - /* 1) move self to the running queue */ + /* 1) move self to the running queue, make sure it stays sorted */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, &running->domain); + list_for_each_entry_reverse(pos, &running->domain, list) + if (entry->cookie < pos->cookie) + break; + list_move_tail(&entry->list, &pos->list); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ -- cgit v1.2.3
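To make the two invariants above concrete -- the running list is kept sorted by cookie, and the lowest in-progress cookie is the minimum of the first pending and the first running entry -- here is a small standalone sketch (ordinary userspace C with a made-up entry type and cookie values; it is not the kernel implementation, which additionally filters the pending list by async domain):

/*
 * Toy model of the invariants restored by the patch above -- not kernel
 * code. The entry type and cookies are made up for the demo.
 */
#include <stdio.h>
#include <stddef.h>

struct entry {
	unsigned long long cookie;
	struct entry *next;
};

/* Sort-insert so a list stays ordered by cookie, smallest first. */
static void insert_sorted(struct entry **head, struct entry *e)
{
	struct entry **p = head;

	while (*p && (*p)->cookie < e->cookie)
		p = &(*p)->next;
	e->next = *p;
	*p = e;
}

/* Lowest in-progress cookie: min of the two sorted lists' heads. */
static unsigned long long lowest_in_progress(const struct entry *pending,
					     const struct entry *running,
					     unsigned long long next_cookie)
{
	unsigned long long lowest = next_cookie;	/* "infinity" value */

	if (pending && pending->cookie < lowest)
		lowest = pending->cookie;
	if (running && running->cookie < lowest)
		lowest = running->cookie;
	return lowest;
}

int main(void)
{
	struct entry e1 = { 1, NULL }, e2 = { 2, NULL }, e3 = { 3, NULL };
	struct entry *pending = NULL, *running = NULL;

	/* Cookie 2 is still pending; 3 and then 1 start running out of order. */
	insert_sorted(&pending, &e2);
	insert_sorted(&running, &e3);
	insert_sorted(&running, &e1);	/* running stays sorted: 1, 3 */

	/* Prints 1: entry 1 is still in flight, so nothing below it is done. */
	printf("lowest in progress: %llu\n",
	       lowest_in_progress(pending, running, 4));
	return 0;
}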