Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/auditfilter.c | 2
-rw-r--r-- | kernel/exit.c | 31
-rw-r--r-- | kernel/fork.c | 9
-rw-r--r-- | kernel/futex.c | 269
-rw-r--r-- | kernel/futex_compat.c | 9
-rw-r--r-- | kernel/irq/spurious.c | 46
-rw-r--r-- | kernel/kallsyms.c | 3
-rw-r--r-- | kernel/kthread.c | 7
-rw-r--r-- | kernel/power/disk.c | 3
-rw-r--r-- | kernel/power/main.c | 19
-rw-r--r-- | kernel/power/process.c | 57
-rw-r--r-- | kernel/power/swap.c | 2
-rw-r--r-- | kernel/profile.c | 1
-rw-r--r-- | kernel/rtmutex.c | 24
-rw-r--r-- | kernel/sched.c | 4
-rw-r--r-- | kernel/signal.c | 38
-rw-r--r-- | kernel/sysctl.c | 2
-rw-r--r-- | kernel/time/clocksource.c | 10
-rw-r--r-- | kernel/time/ntp.c | 2
-rw-r--r-- | kernel/time/tick-broadcast.c | 17
-rw-r--r-- | kernel/time/tick-sched.c | 28
-rw-r--r-- | kernel/time/timekeeping.c | 2
-rw-r--r-- | kernel/time/timer_stats.c | 44
-rw-r--r-- | kernel/timer.c | 12
-rw-r--r-- | kernel/workqueue.c | 84
25 files changed, 457 insertions, 268 deletions
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6c61263ff96d..74cc0fc6bb81 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -311,6 +311,7 @@ int audit_match_class(int class, unsigned syscall) return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); } +#ifdef CONFIG_AUDITSYSCALL static inline int audit_match_class_bits(int class, u32 *mask) { int i; @@ -347,6 +348,7 @@ static int audit_match_signal(struct audit_entry *entry) return 1; } } +#endif /* Common user-space to kernel rule translation. */ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) diff --git a/kernel/exit.c b/kernel/exit.c index c6d14b8008dd..5c8ecbaa19a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -762,11 +762,8 @@ static void exit_notify(struct task_struct *tsk) read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); for (t = next_thread(tsk); t != tsk; t = next_thread(t)) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) { - recalc_sigpending_tsk(t); - if (signal_pending(t)) - signal_wake_up(t, 0); - } + if (!signal_pending(t) && !(t->flags & PF_EXITING)) + recalc_sigpending_and_wake(t); spin_unlock_irq(&tsk->sighand->siglock); read_unlock(&tasklist_lock); } @@ -895,13 +892,29 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->flags & PF_EXITING)) { printk(KERN_ALERT "Fixing recursive fault but reboot is needed!\n"); + /* + * We can do this unlocked here. The futex code uses + * this flag just to verify whether the pi state + * cleanup has been done or not. In the worst case it + * loops once more. We pretend that the cleanup was + * done as there is no way to return. Either the + * OWNER_DIED bit is set by now or we push the blocked + * task into the wait for ever nirwana as well. + */ + tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } + /* + * tsk->flags are checked in the futex code to protect against + * an exiting task cleaning up the robust pi futexes. + */ + spin_lock_irq(&tsk->pi_lock); tsk->flags |= PF_EXITING; + spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -915,7 +928,7 @@ fastcall NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - hrtimer_cancel(&tsk->signal->real_timer); + hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } acct_collect(code, group_dead); @@ -968,6 +981,12 @@ fastcall NORET_TYPE void do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(tsk); + /* + * We can do this unlocked here. The futex code uses this flag + * just to verify whether the pi state cleanup has been done + * or not. In the worst case it loops once more. 
+ */ + tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(); diff --git a/kernel/fork.c b/kernel/fork.c index 49530e40ea8b..73ad5cda1bcd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -45,6 +45,7 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> +#include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/random.h> @@ -1405,7 +1406,9 @@ long do_fork(unsigned long clone_flags, } if (clone_flags & CLONE_VFORK) { + freezer_do_not_count(); wait_for_completion(&vfork); + freezer_count(); if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { current->ptrace_message = nr; ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); @@ -1427,10 +1430,8 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, { struct sighand_struct *sighand = data; - if (flags & SLAB_CTOR_CONSTRUCTOR) { - spin_lock_init(&sighand->siglock); - INIT_LIST_HEAD(&sighand->signalfd_list); - } + spin_lock_init(&sighand->siglock); + INIT_LIST_HEAD(&sighand->signalfd_list); } void __init proc_caches_init(void) diff --git a/kernel/futex.c b/kernel/futex.c index b7ce15c67e32..3b7f7713d9a4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -430,10 +430,6 @@ static struct task_struct * futex_find_get_task(pid_t pid) p = NULL; goto out_unlock; } - if (p->exit_state != 0) { - p = NULL; - goto out_unlock; - } get_task_struct(p); out_unlock: rcu_read_unlock(); @@ -502,7 +498,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *this, *next; struct plist_head *head; struct task_struct *p; - pid_t pid; + pid_t pid = uval & FUTEX_TID_MASK; head = &hb->chain; @@ -520,6 +516,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(pid && pi_state->owner && + pi_state->owner->pid != pid); atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -530,15 +528,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, /* * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when the owner died bit is set - * and TID = 0: + * the new pi_state to it, but bail out when TID = 0 */ - pid = uval & FUTEX_TID_MASK; - if (!pid && (uval & FUTEX_OWNER_DIED)) + if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (!p) - return -ESRCH; + if (IS_ERR(p)) + return PTR_ERR(p); + + /* + * We need to look at the task state flags to figure out, + * whether the task is exiting. To protect against the do_exit + * change of the task flags, we do this protected by + * p->pi_lock: + */ + spin_lock_irq(&p->pi_lock); + if (unlikely(p->flags & PF_EXITING)) { + /* + * The task is on the way out. When PF_EXITPIDONE is + * set, we know that the task has finished the + * cleanup: + */ + int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; + + spin_unlock_irq(&p->pi_lock); + put_task_struct(p); + return ret; + } pi_state = alloc_pi_state(); @@ -551,7 +567,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, /* Store the key for possible exit cleanups: */ pi_state->key = *key; - spin_lock_irq(&p->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; @@ -618,6 +633,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * preserve the owner died bit.) 
*/ if (!(uval & FUTEX_OWNER_DIED)) { + int ret = 0; + newval = FUTEX_WAITERS | new_owner->pid; /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ newval |= (uval & FUTEX_WAITER_REQUEUED); @@ -625,10 +642,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); pagefault_enable(); + if (curval == -EFAULT) - return -EFAULT; + ret = -EFAULT; if (curval != uval) - return -EINVAL; + ret = -EINVAL; + if (ret) { + spin_unlock(&pi_state->pi_mutex.wait_lock); + return ret; + } } spin_lock_irq(&pi_state->owner->pi_lock); @@ -1174,7 +1196,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, #ifdef CONFIG_DEBUG_PI_LIST this->list.plist.lock = &hb2->lock; #endif - } + } this->key = key2; get_futex_key_refs(&key2); drop_count++; @@ -1326,12 +1348,10 @@ static void unqueue_me_pi(struct futex_q *q) /* * Fixup the pi_state owner with current. * - * The cur->mm semaphore must be held, it is released at return of this - * function. + * Must be called with hash bucket lock held and mm->sem held for non + * private futexes. */ -static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, - struct futex_q *q, - struct futex_hash_bucket *hb, +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *curr) { u32 newtid = curr->pid | FUTEX_WAITERS; @@ -1355,23 +1375,24 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, list_add(&pi_state->list, &curr->pi_state_list); spin_unlock_irq(&curr->pi_lock); - /* Unqueue and drop the lock */ - unqueue_me_pi(q); - if (fshared) - up_read(fshared); /* * We own it, so we have to replace the pending owner * TID. This must be atomic as we have preserve the * owner died bit here. 
*/ - ret = get_user(uval, uaddr); + ret = get_futex_value_locked(&uval, uaddr); + while (!ret) { newval = (uval & FUTEX_OWNER_DIED) | newtid; newval |= (uval & FUTEX_WAITER_REQUEUED); + + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); + if (curval == -EFAULT) - ret = -EFAULT; + ret = -EFAULT; if (curval == uval) break; uval = curval; @@ -1553,10 +1574,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, */ uaddr = q.pi_state->key.uaddr; - /* mmap_sem and hash_bucket lock are unlocked at - return of this function */ - ret = fixup_pi_state_owner(uaddr, fshared, - &q, hb, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr); } else { /* * Catch the rare case, where the lock was released @@ -1567,12 +1585,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (rt_mutex_trylock(&q.pi_state->pi_mutex)) ret = 0; } - /* Unqueue and drop the lock */ - unqueue_me_pi(&q); - if (fshared) - up_read(fshared); } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + if (fshared) + up_read(fshared); + debug_rt_mutex_free_waiter(&q.waiter); return ret; @@ -1688,7 +1707,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, struct futex_hash_bucket *hb; u32 uval, newval, curval; struct futex_q q; - int ret, lock_held, attempt = 0; + int ret, lock_taken, ownerdied = 0, attempt = 0; if (refill_pi_state_cache()) return -ENOMEM; @@ -1709,10 +1728,11 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret != 0)) goto out_release_sem; + retry_unlocked: hb = queue_lock(&q, -1, NULL); retry_locked: - lock_held = 0; + ret = lock_taken = 0; /* * To avoid races, we attempt to take the lock here again @@ -1728,43 +1748,44 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(curval == -EFAULT)) goto uaddr_faulted; - /* We own the lock already */ + /* + * Detect deadlocks. In case of REQUEUE_PI this is a valid + * situation and we return success to user space. + */ if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { - if (!detect && 0) - force_sig(SIGKILL, current); - /* - * Normally, this check is done in user space. - * In case of requeue, the owner may attempt to lock this futex, - * even if the ownership has already been given by the previous - * waker. - * In the usual case, this is a case of deadlock, but not in case - * of REQUEUE_PI. - */ if (!(curval & FUTEX_WAITER_REQUEUED)) ret = -EDEADLK; goto out_unlock_release_sem; } /* - * Surprise - we got the lock. Just return - * to userspace: + * Surprise - we got the lock. Just return to userspace: */ if (unlikely(!curval)) goto out_unlock_release_sem; uval = curval; + /* - * In case of a requeue, check if there already is an owner - * If not, just take the futex. + * Set the WAITERS flag, so the owner will know it has someone + * to wake at next unlock */ - if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) { - /* set current as futex owner */ - newval = curval | current->pid; - lock_held = 1; - } else - /* Set the WAITERS flag, so the owner will know it has someone - to wake at next unlock */ - newval = curval | FUTEX_WAITERS; + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED or REQUEUE. We take over the + * futex in this case. We also do an unconditional take over, + * when the owner of the futex died. 
+ * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED and REQUEUE bits */ + newval = (curval & ~FUTEX_TID_MASK) | current->pid; + ownerdied = 0; + lock_taken = 1; + } pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -1775,8 +1796,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(curval != uval)) goto retry_locked; - if (lock_held) { - set_pi_futex_owner(hb, &q.key, curr); + /* + * We took the lock due to requeue or owner died take over. + */ + if (unlikely(lock_taken)) { + /* For requeue we need to fixup the pi_futex */ + if (curval & FUTEX_WAITER_REQUEUED) + set_pi_futex_owner(hb, &q.key, curr); goto out_unlock_release_sem; } @@ -1787,34 +1813,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); if (unlikely(ret)) { - /* - * There were no waiters and the owner task lookup - * failed. When the OWNER_DIED bit is set, then we - * know that this is a robust futex and we actually - * take the lock. This is safe as we are protected by - * the hash bucket lock. We also set the waiters bit - * unconditionally here, to simplify glibc handling of - * multiple tasks racing to acquire the lock and - * cleanup the problems which were left by the dead - * owner. - */ - if (curval & FUTEX_OWNER_DIED) { - uval = newval; - newval = current->pid | - FUTEX_OWNER_DIED | FUTEX_WAITERS; + switch (ret) { - pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, - uval, newval); - pagefault_enable(); + case -EAGAIN: + /* + * Task is exiting and we just wait for the + * exit to complete. + */ + queue_unlock(&q, hb); + if (fshared) + up_read(fshared); + cond_resched(); + goto retry; - if (unlikely(curval == -EFAULT)) + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) goto uaddr_faulted; - if (unlikely(curval != uval)) + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; goto retry_locked; - ret = 0; + } + default: + goto out_unlock_release_sem; } - goto out_unlock_release_sem; } /* @@ -1845,31 +1877,42 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, down_read(fshared); spin_lock(q.lock_ptr); - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case. - */ - if (!ret && q.pi_state->owner != curr) - /* mmap_sem is unlocked at return of this function */ - ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr); - else { + if (!ret) { + /* + * Got the lock. We might not be the anticipated owner + * if we did a lock-steal - fix up the PI-state in + * that case: + */ + if (q.pi_state->owner != curr) + ret = fixup_pi_state_owner(uaddr, &q, curr); + } else { /* * Catch the rare case, where the lock was released - * when we were on the way back before we locked - * the hash bucket. + * when we were on the way back before we locked the + * hash bucket. */ - if (ret && q.pi_state->owner == curr) { - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; + if (q.pi_state->owner == curr && + rt_mutex_trylock(&q.pi_state->pi_mutex)) { + ret = 0; + } else { + /* + * Paranoia check. 
If we did not take the lock + * in the trylock above, then we should not be + * the owner of the rtmutex, neither the real + * nor the pending one: + */ + if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) + printk(KERN_ERR "futex_lock_pi: ret = %d " + "pi-mutex: %p pi-state %p\n", ret, + q.pi_state->pi_mutex.owner, + q.pi_state->owner); } - /* Unqueue and drop the lock */ - unqueue_me_pi(&q); - if (fshared) - up_read(fshared); } - if (!detect && ret == -EDEADLK && 0) - force_sig(SIGKILL, current); + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + if (fshared) + up_read(fshared); return ret != -EINTR ? ret : -ERESTARTNOINTR; @@ -1887,16 +1930,19 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while * still holding the mmap_sem. + * + * ... and hb->lock. :-) --ANK */ + queue_unlock(&q, hb); + if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt); if (ret) - goto out_unlock_release_sem; - goto retry_locked; + goto out_release_sem; + goto retry_unlocked; } - queue_unlock(&q, hb); if (fshared) up_read(fshared); @@ -1940,9 +1986,9 @@ retry: goto out; hb = hash_futex(&key); +retry_unlocked: spin_lock(&hb->lock); -retry_locked: /* * To avoid races, try to do the TID -> 0 atomic transition * again. If it succeeds then we can return without waking @@ -2005,16 +2051,19 @@ pi_faulted: * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while * still holding the mmap_sem. + * + * ... and hb->lock. --ANK */ + spin_unlock(&hb->lock); + if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt); if (ret) - goto out_unlock; - goto retry_locked; + goto out; + goto retry_unlocked; } - spin_unlock(&hb->lock); if (fshared) up_read(fshared); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 338a9b489fbc..27478948b318 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -144,20 +144,21 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec ts; ktime_t t, *tp = NULL; int val2 = 0; + int cmd = op & FUTEX_CMD_MASK; - if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { if (get_compat_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) return -EINVAL; t = timespec_to_ktime(ts); - if (op == FUTEX_WAIT) + if (cmd == FUTEX_WAIT) t = ktime_add(ktime_get(), t); tp = &t; } - if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE - || op == FUTEX_CMP_REQUEUE_PI) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE + || cmd == FUTEX_CMP_REQUEUE_PI) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b0d81aae472f..bd9e272d55e9 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -135,6 +135,39 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) } } +static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +{ + struct irqaction *action; + + if (!irqfixup) + return 0; + + /* We didn't actually handle the IRQ - see if it was misrouted? */ + if (action_ret == IRQ_NONE) + return 1; + + /* + * But for 'irqfixup == 2' we also do it for handled interrupts if + * they are marked as IRQF_IRQPOLL (or for irq zero, which is the + * traditional PC timer interrupt.. 
Legacy) + */ + if (irqfixup < 2) + return 0; + + if (!irq) + return 1; + + /* + * Since we don't get the descriptor lock, "action" can + * change under us. We don't really care, but we don't + * want to follow a NULL pointer. So tell the compiler to + * just load it once by using a barrier. + */ + action = desc->action; + barrier(); + return action && (action->flags & IRQF_IRQPOLL); +} + void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { @@ -144,15 +177,10 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, report_bad_irq(irq, desc, action_ret); } - if (unlikely(irqfixup)) { - /* Don't punish working computers */ - if ((irqfixup == 2 && ((irq == 0) || - (desc->action->flags & IRQF_IRQPOLL))) || - action_ret == IRQ_NONE) { - int ok = misrouted_irq(irq); - if (action_ret == IRQ_NONE) - desc->irqs_unhandled -= ok; - } + if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { + int ok = misrouted_irq(irq); + if (action_ret == IRQ_NONE) + desc->irqs_unhandled -= ok; } desc->irq_count++; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f1bda23140b2..fed54418626c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -257,7 +257,8 @@ const char *kallsyms_lookup(unsigned long addr, pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); - *modname = NULL; + if (modname) + *modname = NULL; return namebuf; } diff --git a/kernel/kthread.c b/kernel/kthread.c index df8a8e8f6ca4..bbd51b81a3e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -70,7 +70,7 @@ static int kthread(void *_create) data = create->data; /* OK, tell user we're spawned, wait for stop or wakeup */ - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); complete(&create->started); schedule(); @@ -162,7 +162,10 @@ EXPORT_SYMBOL(kthread_create); */ void kthread_bind(struct task_struct *k, unsigned int cpu) { - BUG_ON(k->state != TASK_INTERRUPTIBLE); + if (k->state != TASK_UNINTERRUPTIBLE) { + WARN_ON(1); + return; + } /* Must have done schedule() in kthread() before we set_task_cpu */ wait_task_inactive(k); set_task_cpu(k, cpu); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index b5f0543ed84d..f445b9cd60fb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -416,7 +416,8 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) mutex_lock(&pm_mutex); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { - if (!strncmp(buf, hibernation_modes[i], len)) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { mode = i; break; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 40d56a31245e..8812985f3029 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -97,25 +97,26 @@ static int suspend_prepare(suspend_state_t state) } } - if (pm_ops->prepare) { - if ((error = pm_ops->prepare(state))) - goto Thaw; - } - suspend_console(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "Some devices failed to suspend\n"); - goto Resume_devices; + goto Resume_console; } + if (pm_ops->prepare) { + if ((error = pm_ops->prepare(state))) + goto Resume_devices; + } + error = disable_nonboot_cpus(); if (!error) return 0; enable_nonboot_cpus(); - Resume_devices: pm_finish(state); + Resume_devices: device_resume(); + Resume_console: resume_console(); Thaw: thaw_processes(); @@ -289,13 +290,13 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) len = p ? 
p - buf : n; /* First, check if we are requested to hibernate */ - if (!strncmp(buf, "disk", len)) { + if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); return error ? error : n; } for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) break; } if (state < PM_SUSPEND_MAX && *s) diff --git a/kernel/power/process.c b/kernel/power/process.c index 088419387388..e0233d8422b9 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -31,16 +31,36 @@ static inline int freezeable(struct task_struct * p) return 1; } +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + wmb(); + } + clear_tsk_thread_flag(current, TIF_FREEZE); +} + /* Refrigerator is place where frozen processes are stored :-). */ void refrigerator(void) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(); + task_unlock(current); + } else { + task_unlock(current); + return; + } save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - frozen_process(current); spin_lock_irq(&current->sighand->siglock); recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(&current->sighand->siglock); @@ -81,7 +101,7 @@ static void cancel_freezing(struct task_struct *p) pr_debug(" clean up: %s\n", p->comm); do_not_freeze(p); spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_tsk(p); + recalc_sigpending_and_wake(p); spin_unlock_irqrestore(&p->sighand->siglock, flags); } } @@ -112,22 +132,12 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) cancel_freezing(p); continue; } - if (is_user_space(p)) { - if (!freeze_user_space) - continue; - - /* Freeze the task unless there is a vfork - * completion pending - */ - if (!p->vfork_done) - freeze_process(p); - } else { - if (freeze_user_space) - continue; - - freeze_process(p); - } - todo++; + if (freeze_user_space && !is_user_space(p)) + continue; + + freeze_process(p); + if (!freezer_should_skip(p)) + todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ @@ -149,13 +159,16 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) TIMEOUT / HZ, todo); read_lock(&tasklist_lock); do_each_thread(g, p) { - if (is_user_space(p) == !freeze_user_space) + if (freeze_user_space && !is_user_space(p)) continue; - if (freezeable(p) && !frozen(p)) + task_lock(p); + if (freezeable(p) && !frozen(p) && + !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); cancel_freezing(p); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -200,9 +213,7 @@ static void thaw_tasks(int thaw_user_space) if (is_user_space(p) == !thaw_user_space) continue; - if (!thaw_process(p)) - printk(KERN_WARNING " Strange, %s not stopped\n", - p->comm ); + thaw_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b8b235cc19d1..8b1a1b837145 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -584,7 +584,7 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, sizeof(PAGE_SIZE)); + memset(swsusp_header, 0, PAGE_SIZE);
error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) diff --git a/kernel/profile.c b/kernel/profile.c index cc91b9bf759d..5b20fe977bed 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -26,6 +26,7 @@ #include <asm/sections.h> #include <asm/semaphore.h> #include <asm/irq_regs.h> +#include <asm/ptrace.h> struct profile_hit { u32 pc, hits; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 12879f6c1ec3..a6fbb4130521 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -189,6 +189,19 @@ int rt_mutex_adjust_prio_chain(struct task_struct *task, if (!waiter || !waiter->task) goto out_unlock_pi; + /* + * Check the orig_waiter state. After we dropped the locks, + * the previous owner of the lock might have released the lock + * and made us the pending owner: + */ + if (orig_waiter && !orig_waiter->task) + goto out_unlock_pi; + + /* + * Drop out, when the task has no waiters. Note, + * top_waiter can be NULL, when we are in the deboosting + * mode! + */ if (top_waiter && (!task_has_pi_waiters(task) || top_waiter != task_top_pi_waiter(task))) goto out_unlock_pi; @@ -636,9 +649,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, * all over without going into schedule to try * to get the lock now: */ - if (unlikely(!waiter.task)) + if (unlikely(!waiter.task)) { + /* + * Reset the return value. We might + * have returned with -EDEADLK and the + * owner released the lock while we + * were walking the pi chain. + */ + ret = 0; continue; - + } if (unlikely(ret)) break; } diff --git a/kernel/sched.c b/kernel/sched.c index 799d23b4e35d..13cdab3b4c48 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4775,9 +4775,7 @@ int __sched cond_resched_softirq(void) BUG_ON(!in_softirq()); if (need_resched() && system_state == SYSTEM_RUNNING) { - raw_local_irq_disable(); - _local_bh_enable(); - raw_local_irq_enable(); + local_bh_enable(); __cond_resched(); local_bh_disable(); return 1; diff --git a/kernel/signal.c b/kernel/signal.c index 364fc95bf97c..fe590e00db8d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -96,20 +96,38 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -fastcall void recalc_sigpending_tsk(struct task_struct *t) +static int recalc_sigpending_tsk(struct task_struct *t) { if (t->signal->group_stop_count > 0 || (freezing(t)) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) + PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); - else - clear_tsk_thread_flag(t, TIF_SIGPENDING); + return 1; + } + /* + * We must never clear the flag in another thread, or in current + * when it's possible the current syscall is returning -ERESTART*. + * So we don't clear it here, and only callers who know they should do. + */ + return 0; +} + +/* + * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. + * This is superfluous when called on current, the wakeup is a harmless no-op. + */ +void recalc_sigpending_and_wake(struct task_struct *t) +{ + if (recalc_sigpending_tsk(t)) + signal_wake_up(t, 0); } void recalc_sigpending(void) { - recalc_sigpending_tsk(current); + if (!recalc_sigpending_tsk(current)) + clear_thread_flag(TIF_SIGPENDING); + } /* Given the mask, find the first available signal that should be serviced. 
*/ @@ -373,7 +391,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } } } - recalc_sigpending_tsk(tsk); + if (likely(tsk == current)) + recalc_sigpending(); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our @@ -744,7 +763,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) action->sa.sa_handler = SIG_DFL; if (blocked) { sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); + recalc_sigpending_and_wake(t); } } ret = specific_send_sig_info(sig, info, t); @@ -1568,8 +1587,9 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. + * This sets TIF_SIGPENDING, but never clears it. */ - recalc_sigpending(); + recalc_sigpending_tsk(current); } void ptrace_notify(int exit_code) @@ -2273,7 +2293,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) rm_from_queue_full(&mask, &t->signal->shared_pending); do { rm_from_queue_full(&mask, &t->pending); - recalc_sigpending_tsk(t); + recalc_sigpending_and_wake(t); t = next_thread(t); } while (t != current); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4073353abd4f..30ee462ee79f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -227,7 +227,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 128, + .maxlen = CORENAME_MAX_SIZE, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3db5c3c460d7..51b6a6a6158c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -74,7 +74,7 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static int watchdog_resumed; +static unsigned long watchdog_resumed; /* * Interval: 0.5sec Threshold: 0.0625s @@ -104,9 +104,7 @@ static void clocksource_watchdog(unsigned long data) spin_lock(&watchdog_lock); - resumed = watchdog_resumed; - if (unlikely(resumed)) - watchdog_resumed = 0; + resumed = test_and_clear_bit(0, &watchdog_resumed); wdnow = watchdog->read(); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); @@ -151,9 +149,7 @@ static void clocksource_watchdog(unsigned long data) } static void clocksource_resume_watchdog(void) { - spin_lock(&watchdog_lock); - watchdog_resumed = 1; - spin_unlock(&watchdog_lock); + set_bit(0, &watchdog_resumed); } static void clocksource_check_watchdog(struct clocksource *cs) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cb25649c6f50..87aa5ff931e0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -11,6 +11,8 @@ #include <linux/mm.h> #include <linux/time.h> #include <linux/timex.h> +#include <linux/jiffies.h> +#include <linux/hrtimer.h> #include <asm/div64.h> #include <asm/timex.h> diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index eadfce2fff74..8001d37071f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -243,11 +243,18 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) { int cpu = get_cpu(); - if (cpu == *oncpu) - tick_do_broadcast_on_off(&reason); - else - smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1, 1); + if (!cpu_isset(*oncpu, cpu_online_map)) { + printk(KERN_ERR 
"tick-braodcast: ignoring broadcast for " + "offline CPU #%d\n", *oncpu); + } else { + + if (cpu == *oncpu) + tick_do_broadcast_on_off(&reason); + else + smp_call_function_single(*oncpu, + tick_do_broadcast_on_off, + &reason, 1, 1); + } put_cpu(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3483e6cb9549..52db9e3c526e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -167,9 +167,15 @@ void tick_nohz_stop_sched_tick(void) goto end; cpu = smp_processor_id(); - if (unlikely(local_softirq_pending())) - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); + if (unlikely(local_softirq_pending())) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + ratelimit++; + } + } now = ktime_get(); /* @@ -241,6 +247,21 @@ void tick_nohz_stop_sched_tick(void) if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = -1; + ts->idle_sleeps++; + + /* + * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that + * there is no timer pending or at least extremly far + * into the future (12 days for HZ=1000). In this case + * we simply stop the tick timer: + */ + if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { + ts->idle_expires.tv64 = KTIME_MAX; + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; + } + /* * calculate the expiry time for the next timer wheel * timer @@ -248,7 +269,6 @@ void tick_nohz_stop_sched_tick(void) expires = ktime_add_ns(last_update, tick_period.tv64 * delta_jiffies); ts->idle_expires = expires; - ts->idle_sleeps++; if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f9217bf644f6..3d1042f82a68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -273,6 +273,8 @@ static int timekeeping_resume(struct sys_device *dev) unsigned long flags; unsigned long now = read_persistent_clock(); + clocksource_resume(); + write_seqlock_irqsave(&xtime_lock, flags); if (now && (now > timekeeping_suspend_time)) { diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 868f1bceb07f..321693724ad7 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -117,21 +117,6 @@ static struct entry entries[MAX_ENTRIES]; static atomic_t overflow_count; -static void reset_entries(void) -{ - nr_entries = 0; - memset(entries, 0, sizeof(entries)); - atomic_set(&overflow_count, 0); -} - -static struct entry *alloc_entry(void) -{ - if (nr_entries >= MAX_ENTRIES) - return NULL; - - return entries + nr_entries++; -} - /* * The entries are in a hash-table, for fast lookup: */ @@ -149,6 +134,22 @@ static struct entry *alloc_entry(void) static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + static int match_entries(struct entry *entry1, struct entry *entry2) { return entry1->timer == entry2->timer && @@ -202,12 +203,15 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) if (curr) { *curr = *entry; curr->count = 0; + curr->next = NULL; memcpy(curr->comm, comm, TASK_COMM_LEN); + + smp_mb(); /* Ensure that curr is initialized before insert 
*/ + if (prev) prev->next = curr; else *head = curr; - curr->next = NULL; } out_unlock: spin_unlock(&table_lock); @@ -232,10 +236,15 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, /* * It doesnt matter which lock we take: */ - spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + spinlock_t *lock; struct entry *entry, input; unsigned long flags; + if (likely(!active)) + return; + + lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + input.timer = timer; input.start_func = startf; input.expire_func = timerf; @@ -360,6 +369,7 @@ static ssize_t tstats_write(struct file *file, const char __user *buf, if (!active) { reset_entries(); time_start = ktime_get(); + smp_mb(); active = 1; } break; diff --git a/kernel/timer.c b/kernel/timer.c index a6c580ac084b..1a69705c2fb9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -666,7 +666,7 @@ static inline void __run_timers(tvec_base_t *base) static unsigned long __next_timer_interrupt(tvec_base_t *base) { unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + (LONG_MAX >> 1); + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; int index, slot, array, found = 0; struct timer_list *nte; tvec_t *varray[4]; @@ -752,6 +752,14 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, tsdelta = ktime_to_timespec(hr_delta); delta = timespec_to_jiffies(&tsdelta); + + /* + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): + */ + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; + /* * Take rounding errors in to account and make sure, that it * expires in the next tick. Otherwise we go into an endless @@ -1499,8 +1507,6 @@ unregister_time_interpolator(struct time_interpolator *ti) prev = &curr->next; } - clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags); if (ti == time_interpolator) { /* we lost the best time-interpolator: */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fb56fedd5c02..3bebf73be976 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -47,7 +47,6 @@ struct cpu_workqueue_struct { struct workqueue_struct *wq; struct task_struct *thread; - int should_stop; int run_depth; /* Detect run_workqueue() recursion depth */ } ____cacheline_aligned; @@ -71,7 +70,13 @@ static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; static cpumask_t cpu_singlethread_map __read_mostly; -/* optimization, we could use cpu_possible_map */ +/* + * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD + * flushes cwq->worklist. This means that flush_workqueue/wait_on_work + * which comes in between can't use for_each_online_cpu(). We could + * use cpu_possible_map, the cpumask below is more a documentation + * than optimization. + */ static cpumask_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. 
*/ @@ -272,24 +277,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) spin_unlock_irq(&cwq->lock); } -/* - * NOTE: the caller must not touch *cwq if this func returns true - */ -static int cwq_should_stop(struct cpu_workqueue_struct *cwq) -{ - int should_stop = cwq->should_stop; - - if (unlikely(should_stop)) { - spin_lock_irq(&cwq->lock); - should_stop = cwq->should_stop && list_empty(&cwq->worklist); - if (should_stop) - cwq->thread = NULL; - spin_unlock_irq(&cwq->lock); - } - - return should_stop; -} - static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; @@ -302,14 +289,15 @@ static int worker_thread(void *__cwq) for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && !cwq->should_stop - && list_empty(&cwq->worklist)) + if (!freezing(current) && + !kthread_should_stop() && + list_empty(&cwq->worklist)) schedule(); finish_wait(&cwq->more_work, &wait); try_to_freeze(); - if (cwq_should_stop(cwq)) + if (kthread_should_stop()) break; run_workqueue(cwq); @@ -340,18 +328,21 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, insert_work(cwq, &barr->work, tail); } -static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) { + int active; + if (cwq->thread == current) { /* * Probably keventd trying to flush its own queue. So simply run * it by hand rather than deadlocking. */ run_workqueue(cwq); + active = 1; } else { struct wq_barrier barr; - int active = 0; + active = 0; spin_lock_irq(&cwq->lock); if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { insert_wq_barrier(cwq, &barr, 1); @@ -362,6 +353,8 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) if (active) wait_for_completion(&barr.done); } + + return active; } /** @@ -674,7 +667,6 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) return PTR_ERR(p); cwq->thread = p; - cwq->should_stop = 0; return 0; } @@ -740,29 +732,27 @@ EXPORT_SYMBOL_GPL(__create_workqueue); static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { - struct wq_barrier barr; - int alive = 0; - - spin_lock_irq(&cwq->lock); - if (cwq->thread != NULL) { - insert_wq_barrier(cwq, &barr, 1); - cwq->should_stop = 1; - alive = 1; - } - spin_unlock_irq(&cwq->lock); + /* + * Our caller is either destroy_workqueue() or CPU_DEAD, + * workqueue_mutex protects cwq->thread + */ + if (cwq->thread == NULL) + return; - if (alive) { - wait_for_completion(&barr.done); + /* + * If the caller is CPU_DEAD the single flush_cpu_workqueue() + * is not enough, a concurrent flush_workqueue() can insert a + * barrier after us. + * When ->worklist becomes empty it is safe to exit because no + * more work_structs can be queued on this cwq: flush_workqueue + * checks list_empty(), and a "normal" queue_work() can't use + * a dead CPU. + */ + while (flush_cpu_workqueue(cwq)) + ; - while (unlikely(cwq->thread != NULL)) - cpu_relax(); - /* - * Wait until cwq->thread unlocks cwq->lock, - * it won't touch *cwq after that. - */ - smp_rmb(); - spin_unlock_wait(&cwq->lock); - } + kthread_stop(cwq->thread); + cwq->thread = NULL; } /** |
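A minimal sketch of the worker pattern the kernel/workqueue.c hunks above move to, kept deliberately small and hedged: demo_queue, demo_worker and demo_destroy are names made up for illustration, while kthread_should_stop(), kthread_stop(), prepare_to_wait() and finish_wait() are the real kernel APIs the patch now relies on instead of the removed cwq->should_stop flag.

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_queue {
	spinlock_t		lock;		/* protects worklist */
	struct list_head	worklist;	/* pending work items */
	wait_queue_head_t	more_work;	/* the worker sleeps here */
	struct task_struct	*thread;	/* created with kthread_create() */
};

static int demo_worker(void *data)
{
	struct demo_queue *q = data;
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&q->more_work, &wait, TASK_INTERRUPTIBLE);
		/* Sleep only while nothing is queued and no stop was requested. */
		if (!kthread_should_stop() && list_empty(&q->worklist))
			schedule();
		finish_wait(&q->more_work, &wait);

		if (kthread_should_stop())
			break;

		/* Drain q->worklist under q->lock here (omitted in this sketch). */
	}
	return 0;
}

static void demo_destroy(struct demo_queue *q)
{
	if (!q->thread)
		return;
	/*
	 * The patch above additionally flushes the queue until it stays
	 * empty before this point; the sketch only shows the stop
	 * handshake: kthread_stop() sets the stop flag, wakes the worker
	 * and waits for it to exit.
	 */
	kthread_stop(q->thread);
	q->thread = NULL;
}

The prepare_to_wait()/check/schedule() ordering is what keeps this free of lost wakeups: the worker is on the waitqueue and in TASK_INTERRUPTIBLE before it tests the stop flag and the list, and kthread_stop() wakes the thread only after setting its stop flag, so the request is either seen before sleeping or immediately ends the sleep.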