From dbda92d16f8655044e082930e4e9d244b87fde77 Mon Sep 17 00:00:00 2001 From: "Bu, Yitian" Date: Mon, 18 Feb 2013 12:53:37 +0000 Subject: printk: Fix rq->lock vs logbuf_lock unlock lock inversion commit 07354eb1a74d1 ("locking printk: Annotate logbuf_lock as raw") reintroduced a lock inversion problem which was fixed in commit 0b5e1c5255 ("printk: Release console_sem after logbuf_lock"). This happened probably when fixing up patch rejects. Restore the ordering and unlock logbuf_lock before releasing console_sem. Signed-off-by: ybu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/E807E903FE6CBE4D95E420FBFCC273B827413C@nasanexd01h.na.qualcomm.com Signed-off-by: Thomas Gleixner --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 267ce780abe8..e698e80d8428 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1358,9 +1358,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } -- cgit v1.2.3 From 45ceebf77653975815d82fcf7cec0a164215ae11 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Apr 2013 15:10:49 -0400 Subject: sched: Factor out load calculation code from sched/core.c --> sched/proc.c This large chunk of load calculation code can be easily divorced from the main core.c scheduler file, with only a couple prototypes and externs added to a kernel/sched header. Some recent commits expanded the code and the documentation of it, making it large enough to warrant separation. For example, see: 556061b, "sched/nohz: Fix rq->cpu_load[] calculations" 5aaa0b7, "sched/nohz: Fix rq->cpu_load calculations some more" 5167e8d, "sched/nohz: Rewrite and fix load-avg computation -- again" More importantly, it helps reduce the size of the main sched/core.c by yet another significant amount (~600 lines). 
Signed-off-by: Paul Gortmaker Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1366398650-31599-2-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/core.c | 569 ------------------------------------------------- kernel/sched/proc.c | 578 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 8 + 4 files changed, 587 insertions(+), 570 deletions(-) create mode 100644 kernel/sched/proc.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index deaf90e4a1de..54adcf35f495 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..bfa7e77e0b50 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2056,575 +2056,6 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. 
- * - * Once every LOAD_FREQ: - * - * nr_active = 0; - * for_each_possible_cpu(cpu) - * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach - * to calculating nr_active. - * - * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate - * to obtain the same result. See calc_load_fold_active(). - * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding - * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. - * - * This places an upper-bound on the IRQ-off latency of the machine. Then - * again, being late doesn't loose the delta, just wrecks the sample. - * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu - * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. - * - * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 
- */ - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - * - When we go NO_HZ idle during the window, we can negate our sample - * contribution, causing under-accounting. 
- * - * We avoid this by keeping two idle-delta counters and flipping them - * when the window starts, thus separating old and new NO_HZ load. - * - * The only trick is the slight shift in index flip for read vs write. - * - * 0s 5s 10s 15s - * +10 +10 +10 +10 - * |-|-----------|-|-----------|-|-----------|-| - * r:0 0 1 1 0 0 1 1 0 - * w:0 1 1 0 0 1 1 0 0 - * - * This ensures we'll fold the old idle contribution in this window while - * accumlating the new one. - * - * - When we wake up from NO_HZ idle during the window, we push up our - * contribution, since we effectively move our sample point to a known - * busy state. - * - * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which - * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ - int idx = calc_load_idx; - - /* - * See calc_global_nohz(), if we observe the new index, we also - * need to observe the new update time. - */ - smp_rmb(); - - /* - * If the folding window started, make sure we start writing in the - * next idle-delta. - */ - if (!time_before(jiffies, calc_load_update)) - idx++; - - return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ - return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ - struct rq *this_rq = this_rq(); - long delta; - - /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. 
- */ - delta = calc_load_fold_active(this_rq); - if (delta) { - int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); - } -} - -void calc_load_exit_idle(void) -{ - struct rq *this_rq = this_rq(); - - /* - * If we're still before the sample window, we're done. - */ - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - /* - * We woke inside or after the sample window, this means we're already - * accounted through the nohz accounting, so skip the entire deal and - * sync up for the next window. - */ - this_rq->calc_load_update = calc_load_update; - if (time_before(jiffies, this_rq->calc_load_update + 10)) - this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ - int idx = calc_load_read_idx(); - long delta = 0; - - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. 
- */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(void) -{ - long delta, active, n; - - if (!time_before(jiffies, calc_load_update + 10)) { - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? 
active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Flip the idle index... - * - * Make sure we first write the new time then flip the index, so that - * calc_load_write_idx() will see the new time when it reads the new - * index, this avoids a double flip messing things up. - */ - smp_wmb(); - calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active, delta; - - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; - - /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. - */ - calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. 
- */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. 
- */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. 
- */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. 
- */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Called from scheduler_tick() - */ -static void update_cpu_load_active(struct rq *this_rq) -{ - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); - - calc_load_account_active(this_rq); -} - #ifdef CONFIG_SMP /* diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 000000000000..bb3a6a0b8623 --- /dev/null +++ b/kernel/sched/proc.c @@ -0,0 +1,578 @@ +/* + * kernel/sched/proc.c + * + * Kernel load calculations, forked from sched/core.c + */ + +#include <linux/export.h> + +#include "sched.h" + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active().
+ * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't lose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking.
+ */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumulating the new one.
+ * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. 
+ */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_idle[] per calc_load_enter_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_global_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks.
+ */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. 
+ * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. 
+ */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. 
+ */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); + + calc_load_account_active(this_rq); +} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d6155..a38ee0a0650e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,8 +10,16 @@ #include "cpupri.h" #include "cpuacct.h" +struct rq; + extern __read_mostly int scheduler_running; +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern long calc_load_fold_active(struct rq *this_rq); +extern void update_cpu_load_active(struct rq *this_rq); + /* * Convert user-nice values [ -20 ... 0 ... 
19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -- cgit v1.2.3 From 8527632dc95472adb571701e852479531c0567a2 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Apr 2013 15:10:50 -0400 Subject: sched: Move update_load_*() methods from sched.h to fair.c These inlines are only used by kernel/sched/fair.c so they do not need to be present in the main kernel/sched/sched.h file. Signed-off-by: Paul Gortmaker Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1366398650-31599-3-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c8..08a554dd3e90 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a38ee0a0650e..f1f6256c1224 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -892,24 +892,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ -static inline void update_load_add(struct load_weight *lw, unsigned 
long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that -- cgit v1.2.3 From 424c93fe4cbe719e7fd7169248d2b648c493b68d Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 9 May 2013 11:24:03 -0500 Subject: sched: Use this_rq() helper It is a few instructions more efficient and slightly more readable to use this_rq()-> instead of cpu_rq(smp_processor_id())-> . Size comparison of kernel/sched/fair.o: text data bss dec hex filename 27972 122 26 28120 6dd8 fair.o.before 27956 122 26 28104 6dc8 fair.o.after Signed-off-by: Nathan Zimmer Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1368116643-87971-1-git-send-email-nzimmer@sgi.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- kernel/sched/rt.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c8..f2c9c0c3406c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5418,10 +5418,9 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || !sd->nohz_idle) goto unlock; @@ -5436,10 +5435,9 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || sd->nohz_idle) 
goto unlock; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4ab..7aced2e3b085 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -472,7 +472,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) #ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_rq(smp_processor_id())->rd->span; + return this_rq()->rd->span; } #else static inline const struct cpumask *sched_rt_period_mask(void) -- cgit v1.2.3 From 1b1d2fb4444231f25ddabc598aa2b5a9c0833fba Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:08 +0000 Subject: lockdep: remove task argument from debug_check_no_locks_held The only existing caller to debug_check_no_locks_held calls it with 'current' as the task, and the freezer needs to call debug_check_no_locks_held but doesn't already have a current task pointer, so remove the argument. It is already assuming that the current task is relevant by dumping the current stack trace as part of the warning. This was originally part of 6aa9707099c (lockdep: check that no locks held at freeze time) which was reverted in dbf520a9d7d4. Original-author: Mandeep Singh Baines Acked-by: Pavel Machek Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd499..e59756275000 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. 
The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1f3186b37fd5..e16c45b9ee77 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void) { if (!debug_locks_off()) return; @@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! ]\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); print_kernel_ident(); printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - + lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void) { - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3 From 18ad0c6297df1d671ecea83b608cd9e432642a05 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:10 +0000 Subject: freezer: shorten freezer sleep time using exponential backoff All tasks can easily be frozen in under 10 ms, switch to using an initial 1 ms sleep followed by exponential backoff until 8 ms. Also convert the printed time to ms instead of centiseconds. Acked-by: Pavel Machek Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. 
Wysocki --- kernel/power/process.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 98088e0e71e8..fc0df8486449 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only) unsigned int todo; bool wq_busy = false; struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; + u64 elapsed_msecs64; + unsigned int elapsed_msecs; bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; do_gettimeofday(&start); @@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the refrigerator. + * time to enter the refrigerator. Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. */ - msleep(10); + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; } do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; + elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_msecs64, NSEC_PER_MSEC); + elapsed_msecs = elapsed_msecs64; if (todo) { printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " + printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", - elapsed_csecs / 100, elapsed_csecs % 100, + elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (!wakeup) { @@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); + printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + elapsed_msecs % 1000); } return todo ? 
-EBUSY : 0; -- cgit v1.2.3 From 613f5d13b569859171f0896fbc73ee0bfa811fda Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:11 +0000 Subject: freezer: skip waking up tasks with PF_FREEZER_SKIP set Android goes through suspend/resume very often (every few seconds when on a busy wifi network with the screen off), and a significant portion of the energy used to go in and out of suspend is spent in the freezer. If a task has called freezer_do_not_count(), don't bother waking it up. If it happens to wake up later it will call freezer_count() and immediately enter the refrigerator. Combined with patches to convert freezable helpers to use freezer_do_not_count() and convert common sites where idle userspace tasks are blocked to use the freezable helpers, this reduces the time and energy required to suspend and resume. Acked-by: Tejun Heo Acked-by: Pavel Machek Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efba..8b2afc1c9df0 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; + /* + * This check can race with freezer_do_not_count, but worst case that + * will result in an extra wakeup being sent to the task. It does not + * race with freezer_count(), the barriers in freezer_count() and + * freezer_should_skip() ensure that either freezer_count() sees + * freezing == true in try_to_freeze() and freezes, or + * freezer_should_skip() sees !PF_FREEZER_SKIP and freezes the task + * normally. 
+ */ + if (freezer_should_skip(p)) + return false; + spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p)) { spin_unlock_irqrestore(&freezer_lock, flags); -- cgit v1.2.3 From 56467c7697f5aef6974501fbe2c3e63674583549 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:18 +0000 Subject: futex: use freezable blocking call Avoid waking up every thread sleeping in a futex_wait call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Acked-by: Thomas Gleixner Acked-by: Darren Hart Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c94..d710fae8abbe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -1807,7 +1808,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - schedule(); + freezable_schedule(); } __set_current_state(TASK_RUNNING); } -- cgit v1.2.3 From b0f8c44f30e58c3aaaaaf864d5c3d3cc2e8a4c2d Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:19 +0000 Subject: nanosleep: use freezable blocking call Avoid waking up every thread sleeping in a nanosleep call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. 
This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Acked-by: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/hrtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f8..3ee4d06c6fc2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod t->task = NULL; if (likely(t->task)) - schedule(); + freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; -- cgit v1.2.3 From a2d5f1f5d941593e61071dc78e9de228eda5475f Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Mon, 6 May 2013 23:50:20 +0000 Subject: sigtimedwait: use freezable blocking call Avoid waking up every thread sleeping in a sigtimedwait call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Acked-by: Tejun Heo Signed-off-by: Colin Cross Signed-off-by: Rafael J. 
Wysocki --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 113411bfe8b1..50e41075ac77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = schedule_timeout_interruptible(timeout); + timeout = freezable_schedule_timeout_interruptible(timeout); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); -- cgit v1.2.3 From cee22a15052faa817e3ec8985a28154d3fabc7aa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 8 Apr 2013 16:45:40 +0530 Subject: workqueues: Introduce new flag WQ_POWER_EFFICIENT for power oriented workqueues Workqueues can be performance or power-oriented. Currently, most workqueues are bound to the CPU they were created on. This gives good performance (due to cache effects) at the cost of potentially waking up otherwise idle cores (Idle from scheduler's perspective. Which may or may not be physically idle) just to process some work. To save power, we can allow the work to be rescheduled on a core that is already awake. Workqueues created with the WQ_UNBOUND flag will allow some power savings. However, we don't change the default behaviour of the system. To enable power-saving behaviour, a new config option CONFIG_WQ_POWER_EFFICIENT needs to be turned on. This option can also be overridden by the workqueue.power_efficient boot parameter. tj: Updated config description and comments. Renamed CONFIG_WQ_POWER_EFFICIENT to CONFIG_WQ_POWER_EFFICIENT_DEFAULT. 
Signed-off-by: Viresh Kumar Reviewed-by: Amit Kucheria Signed-off-by: Tejun Heo --- kernel/power/Kconfig | 20 ++++++++++++++++++++ kernel/workqueue.c | 13 +++++++++++++ 2 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180b..46455961a88f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,26 @@ config PM_GENERIC_DOMAINS bool depends on PM +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + default n + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues. + + Enabling workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. 
+ config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4aa9f5bc6b2d..8068d97ce141 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask; static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); +/* see the comment above the definition of WQ_POWER_EFFICIENT */ +#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT +static bool wq_power_efficient = true; +#else +static bool wq_power_efficient; +#endif + +module_param_named(power_efficient, wq_power_efficient, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -4085,6 +4094,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ + if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) + flags |= WQ_UNBOUND; + /* allocate wq and format name */ if (flags & WQ_UNBOUND) tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); -- cgit v1.2.3 From 0668106ca3865ba945e155097fb042bf66d364d3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 24 Apr 2013 17:12:54 +0530 Subject: workqueue: Add system wide power_efficient workqueues This patch adds system wide workqueues aligned towards power saving. This is done by allocating them with WQ_UNBOUND flag if 'wq_power_efficient' is set to 'true'. tj: updated comments a bit. 
Signed-off-by: Viresh Kumar Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8068d97ce141..16ca2d3dd29f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -314,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +struct workqueue_struct *system_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_power_efficient_wq); +struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, @@ -4987,8 +4991,15 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_power_efficient_wq = alloc_workqueue("events_power_efficient", + WQ_POWER_EFFICIENT, 0); + system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", + WQ_FREEZABLE | WQ_POWER_EFFICIENT, + 0); BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_power_efficient_wq || + !system_freezable_power_efficient_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3 From fa3ca07e96185aa1496b405472399a2a2a336a17 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:56 -0700 Subject: cgroup: refactor hierarchy_id handling We're planning to convert hierarchy_ida to an idr and use it to look up hierarchy from its id. 
As we want the mapping to happen atomically with cgroupfs_root registration, this patch refactors hierarchy_id init / exit so that ida operations happen inside cgroup_[root_]mutex. * s/init_root_id()/cgroup_init_root_id()/ and make it return 0 or -errno like a normal function. * Move hierarchy_id initialization from cgroup_root_from_opts() into cgroup_mount() block where the root is confirmed to be used and being registered while holding both mutexes. * Split cgroup_drop_id() into cgroup_exit_root_id() and cgroup_free_root(), so that ID release can happen before dropping the mutexes in cgroup_kill_sb(). The latter expects hierarchy_id to be exited before being invoked. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f80..dbc84f7d23b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1426,13 +1426,13 @@ static void init_cgroup_root(struct cgroupfs_root *root) list_add_tail(&cgrp->allcg_node, &root->allcg_list); } -static bool init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroupfs_root *root) { - int ret = 0; + int ret; do { if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return false; + return -ENOMEM; spin_lock(&hierarchy_id_lock); /* Try to allocate the next unused ID */ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, @@ -1448,7 +1448,18 @@ static bool init_root_id(struct cgroupfs_root *root) } spin_unlock(&hierarchy_id_lock); } while (ret); - return true; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + if (root->hierarchy_id) { + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + + root->hierarchy_id = 0; + } } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1482,10 +1493,6 @@ static 
struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) if (!root) return ERR_PTR(-ENOMEM); - if (!init_root_id(root)) { - kfree(root); - return ERR_PTR(-ENOMEM); - } init_cgroup_root(root); root->subsys_mask = opts->subsys_mask; @@ -1500,17 +1507,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_drop_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroupfs_root *root) { - if (!root) - return; + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); - BUG_ON(!root->hierarchy_id); - spin_lock(&hierarchy_id_lock); - ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - ida_destroy(&root->cgroup_ida); - kfree(root); + ida_destroy(&root->cgroup_ida); + kfree(root); + } } static int cgroup_set_super(struct super_block *sb, void *data) @@ -1597,7 +1602,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); goto drop_modules; } @@ -1641,6 +1646,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + ret = cgroup_init_root_id(root); + if (ret) + goto unlock_drop; + ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { free_cg_links(&tmp_cg_links); @@ -1684,7 +1693,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * We re-used an existing hierarchy - the new root (if * any) is not needed */ - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && root->flags != opts.flags) { @@ -1702,6 +1711,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return dget(sb->s_root); unlock_drop: + cgroup_exit_root_id(root); 
mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1754,13 +1764,15 @@ static void cgroup_kill_sb(struct super_block *sb) { root_count--; } + cgroup_exit_root_id(root); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); simple_xattrs_free(&cgrp->xattrs); kill_litter_super(sb); - cgroup_drop_root(root); + cgroup_free_root(root); } static struct file_system_type cgroup_fs_type = { @@ -4642,7 +4654,9 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(!init_root_id(&rootnode)); + + /* allocate id for the dummy hierarchy */ + BUG_ON(cgroup_init_root_id(&rootnode)); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { -- cgit v1.2.3 From 54e7b4eb15fc4354d5ada5469e3db4a220ddb3ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:57 -0700 Subject: cgroup: drop hierarchy_id_lock Now that hierarchy_id alloc / free are protected by the cgroup mutexes, there's no need for this separate lock. Drop it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dbc84f7d23b8..3ef677d314bc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -189,9 +189,13 @@ struct cgroup_event { static LIST_HEAD(roots); static int root_count; +/* + * Hierarchy ID allocation and mapping. It follows the same exclusion + * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for + * writes, either for reads. 
+ */ static DEFINE_IDA(hierarchy_ida); static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -1430,10 +1434,12 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) { int ret; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + do { if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock(&hierarchy_id_lock); /* Try to allocate the next unused ID */ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, &root->hierarchy_id); @@ -1446,18 +1452,17 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) /* Can only get here if the 31-bit IDR is full ... */ BUG_ON(ret); } - spin_unlock(&hierarchy_id_lock); } while (ret); return 0; } static void cgroup_exit_root_id(struct cgroupfs_root *root) { + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + if (root->hierarchy_id) { - spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - root->hierarchy_id = 0; } } @@ -4656,8 +4661,14 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, key); /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); + BUG_ON(cgroup_init_root_id(&rootnode)); + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { err = -ENOMEM; -- cgit v1.2.3 From 1a574231669f8c3065c83974e9557fcbbd94b8a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 11:36:58 -0700 Subject: cgroup: make hierarchy_id use cyclic idr We want to be able to look up a hierarchy from its id and cyclic allocation is a whole lot simpler with idr. Convert to idr and use idr_alloc_cyclic().
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ef677d314bc..dcb417c6c242 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -194,8 +194,7 @@ static int root_count; * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for * writes, either for reads. */ -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; +static DEFINE_IDR(cgroup_hierarchy_idr); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -1432,27 +1431,16 @@ static void init_cgroup_root(struct cgroupfs_root *root) static int cgroup_init_root_id(struct cgroupfs_root *root) { - int ret; + int id; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_root_mutex); - do { - if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return -ENOMEM; - /* Try to allocate the next unused ID */ - ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, - &root->hierarchy_id); - if (ret == -ENOSPC) - /* Try again starting from 0 */ - ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); - if (!ret) { - next_hierarchy_id = root->hierarchy_id + 1; - } else if (ret != -EAGAIN) { - /* Can only get here if the 31-bit IDR is full ... 
*/ - BUG_ON(ret); - } - } while (ret); + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 2, 0, GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; return 0; } @@ -1462,7 +1450,7 @@ static void cgroup_exit_root_id(struct cgroupfs_root *root) lockdep_assert_held(&cgroup_root_mutex); if (root->hierarchy_id) { - ida_remove(&hierarchy_ida, root->hierarchy_id); + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); root->hierarchy_id = 0; } } -- cgit v1.2.3 From 857a2beb09ab83e9a8185821ae16db7dfbe8b837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 20:50:08 -0700 Subject: cgroup: implement task_cgroup_path_from_hierarchy() kdbus folks want a sane way to determine the cgroup path that a given task belongs to on a given hierarchy, which is a reasonable thing to expect from cgroup core. Implement task_cgroup_path_from_hierarchy(). v2: Dropped unnecessary NULL check on the return value of task_cgroup_from_root() as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Li Zefan Cc: Kay Sievers Cc: Lennart Poettering Cc: Daniel Mack --- kernel/cgroup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb417c6c242..6b2b1d945df2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1827,6 +1827,38 @@ out: } EXPORT_SYMBOL_GPL(cgroup_path); +/** + * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * @task: target task + * @hierarchy_id: the hierarchy to look up @task's cgroup from + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and + * copy its path into @buf. This function grabs cgroup_mutex and shouldn't + * be used inside locks used by cgroup controller callbacks.
+ */ +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, + char *buf, size_t buflen) +{ + struct cgroupfs_root *root; + struct cgroup *cgrp = NULL; + int ret = -ENOENT; + + mutex_lock(&cgroup_mutex); + + root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); + if (root) { + cgrp = task_cgroup_from_root(task, root); + ret = cgroup_path(cgrp, buf, buflen); + } + + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); + /* * Control Group taskset */ -- cgit v1.2.3 From cb65537ee1134d3cc55c1fa83952bc8eb1212833 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 May 2013 19:50:26 +0100 Subject: Add wait_on_atomic_t() and wake_up_atomic_t() Add wait_on_atomic_t() and wake_up_atomic_t() to indicate became-zero events on atomic_t types. This uses the bit-wake waitqueue table. The key is set to a value outside of the number of bits in a long so that wait_on_bit() won't be woken up accidentally. What I'm using this for is: in a following patch I add a counter to struct fscache_cookie to count the number of outstanding operations that need access to netfs data. The way this works is: (1) When a cookie is allocated, the counter is initialised to 1. (2) When an operation wants to access netfs data, it calls atomic_inc_unless() to increment the counter before it does so. If it was 0, then the counter isn't incremented, the operation isn't permitted to access the netfs data (which might by this point no longer exist) and the operation aborts in some appropriate manner. (3) When an operation finishes with the netfs data, it decrements the counter and if it reaches 0, calls wake_up_atomic_t() on it - the assumption being that it was the last blocker. (4) When a cookie is released, the counter is decremented and the releaser uses wait_on_atomic_t() to wait for the counter to become 0 - which should indicate no one is using the netfs data any longer. The netfs data can then be destroyed. 
There are some alternatives that I have thought of and that have been suggested by Tejun Heo: (A) Using wait_on_bit() to wait on a bit in the counter. This doesn't work because if that bit happens to be 0 then the wait won't happen - even if the counter is non-zero. (B) Using wait_on_bit() to wait on a flag elsewhere which is cleared when the counter reaches 0. Such a flag would be redundant and would add complexity. (C) Adding a waitqueue to fscache_cookie - this would expand that struct by several words for an event that happens just once in each cookie's lifetime. Further, cookies are generally per-file so there are likely to be a lot of them. (D) Similar to (C), but add a pointer to a waitqueue in the cookie instead of a waitqueue. This would add a single word per cookie and so would be less of an expansion - but still an expansion. (E) Adding a static waitqueue to the fscache module. Generally this would be fine, but under certain circumstances many cookies will all get added at the same time (e.g. NFS umount, cache withdrawal) thereby presenting scaling issues. Note that the wait may be significant as disk I/O may be in progress. So, I think reusing the wait_on_bit() waitqueue set is reasonable. I don't make much use of the waitqueue I need on a per-cookie basis, but sometimes I have a huge flood of the cookies to deal with. I also don't want to add a whole new set of global waitqueue tables specifically for the dec-to-0 event if I can reuse the bit tables.
Signed-off-by: David Howells Tested-By: Milosz Tanski Acked-by: Jeff Layton --- kernel/wait.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ead..ce0daa320a26 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit) return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; } EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. 
+ */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @word: The word being waited on, a kernel virtual address + * @bit: The bit of the word being waited on + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); -- cgit v1.2.3 From 5d33b883aed81c6fbcd09c6f7c3619eee850a7e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:43 +0000 Subject: clocksource: Always verify highres capability If a clocksource has a (wrong) high rating, but can't be used as a timebase for oneshot tick mode, it is unconditionally selected even when the system is already in oneshot tick mode. This causes full system failure. Verify the clocksource selection against the oneshot mode. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.635040849@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141a..dda5c7130d93 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,6 +553,26 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET +static struct clocksource *clocksource_find_best(bool oneshot) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + /** * clocksource_select - Select the best clocksource available * @@ -563,12 +583,14 @@ static u64 clocksource_max_deferment(struct clocksource *cs) */ static void clocksource_select(void) { + bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; - if (!finished_booting || list_empty(&clocksource_list)) + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. 
*/ list_for_each_entry(cs, &clocksource_list, list) { if (strcmp(cs->name, override_name) != 0) @@ -578,8 +600,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " -- cgit v1.2.3 From ba919d1caa2e624eb8c6cae1f2ce0a253e697d45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Let timekeeping_notify return success/error timekeeping_notify() can fail due to a cs->enable() failure. The caller, however, does not notice and happily keeps the wrong clocksource as the current one. Let the caller know about failure, so the current clocksource will be shown correctly in sysfs. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.696321912@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 6 +++--- kernel/time/timekeeping.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index dda5c7130d93..1923a340bd91 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -611,10 +611,10 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..da6e10c7a378 100644 --- a/kernel/time/timekeeping.c +++
b/kernel/time/timekeeping.c @@ -648,14 +648,15 @@ static int change_clocksource(void *data) * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &timekeeper; if (tk->clock == clock) - return; + return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); + return tk->clock == clock ? 0 : -1; } /** -- cgit v1.2.3 From 09ac369c825d9d593404306d59062d854b321e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:44 +0000 Subject: clocksource: Add module refcount Add a module refcount, so the current clocksource cannot be removed unconditionally. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.762417789@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index da6e10c7a378..933efa4071c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -627,11 +627,20 @@ static int change_clocksource(void *data) write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); - if (!new->enable || new->enable(new) == 0) { - old = tk->clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. 
+ */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } } timekeeping_update(tk, true, true); -- cgit v1.2.3 From f5a2e34375a5e2b711aea488ac3ae50eeba6d57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Allow clocksource select to skip current clocksource Preparatory patch for clocksource unbind support. Split out code from clocksource_select and modify it, so it skips the current clocksource on request and tries to find a fallback clocksource. Convert all existing users. No functional change. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.834965397@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1923a340bd91..9782997cb6cf 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -553,7 +553,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -static struct clocksource *clocksource_find_best(bool oneshot) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -566,6 +566,8 @@ static struct clocksource *clocksource_find_best(bool oneshot) * the best rating. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; return cs; @@ -573,26 +575,20 @@ static struct clocksource *clocksource_find_best(bool oneshot) return NULL; } -/** - * clocksource_select - Select the best clocksource available - * - * Private function. 
Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static void __clocksource_select(bool skipcur) { bool oneshot = tick_oneshot_mode_active(); struct clocksource *best, *cs; /* Find the best suitable clocksource */ - best = clocksource_find_best(oneshot); + best = clocksource_find_best(oneshot, skipcur); if (!best) return; /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -618,6 +614,19 @@ static void clocksource_select(void) } } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } -- cgit v1.2.3 From 29b5407819f59731c9423238fae03b756822708c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:45 +0000 Subject: clocksource: Split out user string input Split out the user string input for clocksource override. Preparatory patch for unbind. 
[ jstultz: Fix an off by one error ] Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.895851338@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9782997cb6cf..d7f1a45c2fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,8 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +#define CS_NAME_LEN 32 +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -838,6 +839,23 @@ sysfs_show_current_clocksources(struct device *dev, return count; } +static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused @@ -852,22 +870,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! 
*/ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + size_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = clocksource_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); -- cgit v1.2.3 From 7eaeb34305dee26634f7c98ae62646da5cebe91d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Provide unbind interface in sysfs With the module refcount held for the current clocksource there is no way to unload the module. Provide a sysfs interface which allows unbinding the clocksource. One could argue that the clocksource override could be (ab)used to do so, but the clocksource override cannot be used from the kernel itself, while an unbind function can be used to programmatically check whether a clocksource can be shut down or not. The unbind functionality uses the new skip-current feature of clocksource_select and verifies that a fallback clocksource has been installed. If the clocksource which should be unbound is the current clocksource and no fallback can be found, unbind returns -EBUSY. This does not support the unbinding of a clocksource which is used as the watchdog clocksource. No point in fostering crappy hardware.
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143435.964218245@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d7f1a45c2fa5..791d1aeb17ac 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -440,6 +440,11 @@ static int clocksource_watchdog_kthread(void *data) return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -451,6 +456,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -628,6 +634,11 @@ static void clocksource_select(void) return __clocksource_select(false); } +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -803,6 +814,29 @@ void clocksource_change_rating(struct clocksource *cs, int rating) } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. 
+ */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered @@ -883,6 +917,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +/** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + size_t ret; + + ret = clocksource_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? 
ret : count; +} + /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused @@ -925,6 +993,8 @@ sysfs_show_available_clocksources(struct device *dev, static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); @@ -948,6 +1018,9 @@ static int __init init_clocksource_sysfs(void) error = device_create_file( &device_clocksource, &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); if (!error) error = device_create_file( &device_clocksource, -- cgit v1.2.3 From a89c7edbe7d7aa80f507915f3dd801211b116b79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:46 +0000 Subject: clocksource: Let clocksource_unregister() return success/error The unregister call can fail, if the clocksource is the current one and there is no replacement clocksource available. It can also fail, if the clocksource is the watchdog clocksource and I'm not going to provide support for this. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.029915527@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 791d1aeb17ac..31b90332f47b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -389,28 +389,17 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. */ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. 
*/ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -841,13 +830,15 @@ static int clocksource_unbind(struct clocksource *cs) * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); -- cgit v1.2.3 From 7172a286ced0c1f4f239a0fa09db54ed37d3ead2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:47 +0000 Subject: clockevents: Get rid of the notifier chain 7+ years and still a single user. Kill it. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.098520211@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 35 +++-------------------------------- kernel/time/tick-broadcast.c | 5 ++--- kernel/time/tick-common.c | 30 +++++------------------------- kernel/time/tick-internal.h | 7 ++++--- 4 files changed, 14 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee137..dd70b4842c62 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "tick-internal.h" @@ -23,10 +22,6 @@ /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); @@ -232,30 +227,6 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t 
expires, return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -/** - * clockevents_register_notifier - register a clock events change listener - */ -int clockevents_register_notifier(struct notifier_block *nb) -{ - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); - - return ret; -} - -/* - * Notify about a clock event change. Called with clockevents_lock - * held. - */ -static void clockevents_do_notify(unsigned long reason, void *dev) -{ - raw_notifier_call_chain(&clockevents_chain, reason, dev); -} - /* * Called after a notify add to make devices available which were * released from the notifier call. @@ -269,7 +240,7 @@ static void clockevents_notify_released(void) struct clock_event_device, list); list_del(&dev->list); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); } } @@ -290,7 +261,7 @@ void clockevents_register_device(struct clock_event_device *dev) raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -433,7 +404,7 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); + tick_notify(reason, arg); switch (reason) { case CLOCK_EVT_NOTIFY_CPU_DEAD: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..3500caaa0bfd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,7 +64,7 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int 
tick_check_broadcast_device(struct clock_event_device *dev) +void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; @@ -72,7 +72,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) @@ -90,7 +90,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) */ if (dev->features & CLOCK_EVT_FEAT_ONESHOT) tick_clock_notify(); - return 1; } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc06..dbf4e18d5101 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -208,11 +208,11 @@ static void tick_setup_device(struct tick_device *td, /* * Check, if the new registered device should be used. */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; + int cpu; unsigned long flags; raw_spin_lock_irqsave(&tick_device_lock, flags); @@ -275,18 +275,14 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_oneshot_notify(); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? 
*/ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - + tick_install_broadcast_device(newdev); raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; } /* @@ -360,17 +356,10 @@ static void tick_resume(void) raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) +void tick_notify(unsigned long reason, void *dev) { switch (reason) { - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_OFF: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -404,21 +393,12 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, default: break; } - - return NOTIFY_OK; } -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae4602..60742fe6f63d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,6 +18,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_notify(unsigned long reason, void *dev); +extern void tick_check_new_device(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); @@ -90,7 +92,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern 
void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +104,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) -- cgit v1.2.3 From 7126cac426137633e470167524e7bcb590fd49b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Simplify locking Now that the notifier chain is gone there are no other users and it's pointless to nest tick_device_lock inside of clockevents_lock because there is no other use case. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.162888472@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index dbf4e18d5101..170a4bdfa99e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -206,16 +205,14 @@ static void tick_setup_device(struct tick_device *td, } /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. 
*/ void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -273,8 +270,6 @@ void tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); return; out_bc: @@ -282,7 +277,6 @@ out_bc: * Can the new device be used as a broadcast device ? */ tick_install_broadcast_device(newdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* @@ -311,9 +305,7 @@ static void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -325,26 +317,20 @@ static void tick_shutdown(unsigned int *cpup) dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -353,9 +339,11 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } +/* + * Called with clockevents_lock held and interrupts disabled + */ void 
tick_notify(unsigned long reason, void *dev) { switch (reason) { -- cgit v1.2.3 From 8c53daf63f56791ed47fc585206ef3049489612f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:48 +0000 Subject: clockevents: Move the tick_notify() switch case to clockevents_notify() No need to call another function and have duplicated cases. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.235746557@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 28 ++++++++++++++++++++++++- kernel/time/tick-common.c | 50 ++++----------------------------------------- kernel/time/tick-internal.h | 5 ++++- 3 files changed, 35 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dd70b4842c62..0e3a8448e115 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -404,10 +404,36 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 170a4bdfa99e..84c7cfca4d7d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -284,7 +284,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -301,7 +301,7 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; @@ -319,14 +319,14 @@ static void tick_shutdown(unsigned int *cpup) } } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); clockevents_shutdown(td->evtdev); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); int broadcast = tick_resume_broadcast(); @@ -341,48 +341,6 @@ static void tick_resume(void) } } -/* - * Called with clockevents_lock held and interrupts disabled - */ -void tick_notify(unsigned long reason, void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - 
break; - - default: - break; - } -} - /** * tick_init - initialize the tick control */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 60742fe6f63d..06bfc8802dfb 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -18,8 +18,11 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); -extern void tick_notify(unsigned long reason, void *dev); extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); extern void clockevents_shutdown(struct clock_event_device *dev); -- cgit v1.2.3 From ccf33d6880f39a35158fff66db13000ae4943fac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Add module refcount We want to be able to remove clockevent modules as well. Add a refcount so we don't remove a module with an active clock event device. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.307435149@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 1 + kernel/time/tick-broadcast.c | 3 +++ kernel/time/tick-common.c | 4 ++++ 3 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0e3a8448e115..89e394caa769 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -357,6 +357,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. 
*/ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3500caaa0bfd..0e374cd2e0ef 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -73,6 +74,8 @@ void tick_install_broadcast_device(struct clock_event_device *dev) tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return; + if (!try_module_get(dev->owner)) + return; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); if (cur) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 84c7cfca4d7d..433a1e11d13b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -257,6 +258,9 @@ void tick_check_new_device(struct clock_event_device *newdev) goto out_bc; } + if (!try_module_get(newdev->owner)) + return; + /* * Replace the eventually existing device by the new * device. If the current device is the broadcast device, do -- cgit v1.2.3 From 501f867064e95f9a6f540e60705be0937280e7ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:49 +0000 Subject: clockevents: Provide sysfs interface Provide a simple sysfs interface for the clockevent devices. Show the current active clockevent device. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.371634778@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 89e394caa769..0a23f4f29934 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "tick-internal.h" @@ -460,4 +461,89 @@ void clockevents_notify(unsigned long reason, void *arg) raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? 
tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ -- cgit v1.2.3 From 45cb8e01b2ecef1c2afb18333e95793fa1a90281 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Split out selection logic Split out the clockevent device selection logic. Preparatory patch to allow unbinding active clockevent devices. 
Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.431796247@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 25 ++++++++++++---- kernel/time/tick-common.c | 69 +++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0e374cd2e0ef..d067c01586f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -65,19 +65,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; - if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || - (tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if (!tick_check_broadcast_device(cur, dev)) return; + if (!try_module_get(dev->owner)) return; - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 433a1e11d13b..c34021650348 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -205,6 +205,37 @@ static void 
tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* Use the higher rated one */ + return !curdev || newdev->rating > curdev->rating; +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. @@ -223,40 +254,12 @@ void tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; - - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! 
- */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; if (!try_module_get(newdev->owner)) return; -- cgit v1.2.3 From 03e13cf5ee60584fe0c831682c67212effb7fca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2013 20:31:50 +0000 Subject: clockevents: Implement unbind functionality Provide a sysfs interface to allow unbinding of clockevent devices. The device is unbound if it is unused or if there is a replacement device available. Unbinding of broadcast devices is not supported as we don't want to foster that nonsense. If no replacement device is available the unbind returns -EBUSY. Unbind is available from the kernel and through sysfs, which is necessary to drop the module refcount. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130425143436.499216659@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 125 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/clocksource.c | 9 ++-- kernel/time/tick-common.c | 24 +++++++++ kernel/time/tick-internal.h | 7 +++ 4 files changed, 161 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0a23f4f29934..38959c866789 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -25,6 +25,13 @@ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -245,6 +252,90 @@ static void 
clockevents_notify_released(void) } } +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. 
+ */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -487,6 +578,38 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev, } static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + size_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? 
ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", @@ -529,6 +652,8 @@ static int __init tick_init_sysfs(void) err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 31b90332f47b..6d05b00410cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@ #include #include +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -174,7 +176,6 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -#define CS_NAME_LEN 32 static char override_name[CS_NAME_LEN]; static int finished_booting; @@ -864,7 +865,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -static size_t clocksource_get_uname(const char *buf, char *dst, size_t cnt) +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -899,7 +900,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, mutex_lock(&clocksource_mutex); - ret = clocksource_get_uname(buf, override_name, count); + ret = sysfs_get_uname(buf, override_name, count); if (ret >= 0) clocksource_select(); @@ -925,7 +926,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, char name[CS_NAME_LEN]; size_t ret; - ret = clocksource_get_uname(buf, name, count); + ret = sysfs_get_uname(buf, name, count); if (ret < 0) return ret; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c34021650348..5edfb4806032 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c 
@@ -205,6 +205,17 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + static bool tick_check_percpu(struct clock_event_device *curdev, struct clock_event_device *newdev, int cpu) { @@ -236,6 +247,19 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return !curdev || newdev->rating > curdev->rating; } +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + /* * Check, if the new registered device should be used. Called with * clockevents_lock held and interrupts disabled. 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 06bfc8802dfb..be1690eaecff 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -11,6 +11,8 @@ extern seqlock_t jiffies_lock; #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 +#define CS_NAME_LEN 32 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; @@ -23,9 +25,14 @@ extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3 From bdc7119f1bdd0632d42f435941dc290216a436e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: make cgroup_is_removed() static cgroup_is_removed() no longer has external users and it shouldn't grow any - controllers should deal with cgroup_subsys_state on/offline state instead of cgroup removal state. Make it static. While at it, make it return bool. 
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a19419f4af1a..501974823b33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,7 +226,7 @@ static int css_refcnt(struct cgroup_subsys_state *css) } /* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_removed(const struct cgroup *cgrp) { return test_bit(CGRP_REMOVED, &cgrp->flags); } -- cgit v1.2.3 From 53fa5261747a90746531e8a1c81eeb78fedc2f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: add cgroup->serial_nr and implement cgroup_next_sibling() Currently, there's no easy way to find out the next sibling cgroup unless it's known that the current cgroup is accessed from the parent's children list in a single RCU critical section. This in turn forces all iterators to require whole iteration to be enclosed in a single RCU critical section, which sometimes is too restrictive. This patch implements cgroup_next_sibling() which can reliably determine the next sibling regardless of the state of the current cgroup as long as it's accessible. It currently is impossible to determine the next sibling after dropping RCU read lock because the cgroup being iterated could be removed anytime and if RCU read lock is dropped, nothing guarantees its ->sibling.next pointer is accessible. A removed cgroup would continue to point to its next sibling for RCU accesses but stop receiving updates from the sibling. IOW, the next sibling could be removed and then complete its grace period while RCU read lock is dropped, making it unsafe to dereference ->sibling.next after dropping and re-acquiring RCU read lock. This can be solved by adding a way to traverse to the next sibling without dereferencing ->sibling.next.
This patch adds a monotonically increasing cgroup serial number, cgroup->serial_nr, which guarantees that all cgroup->children lists are kept in increasing serial_nr order. A new function, cgroup_next_sibling(), is implemented, which, if CGRP_REMOVED is not set on the current cgroup, follows ->sibling.next; otherwise, traverses the parent's ->children list until it sees a sibling with higher ->serial_nr. This allows the function to always return the next sibling regardless of the state of the current cgroup without adding overhead in the fast path. Further patches will update the iterators to use cgroup_next_sibling() so that they allow dropping RCU read lock and blocking while iteration is in progress which in turn will be used to simplify controllers. v2: Typo fix as per Serge. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn --- kernel/cgroup.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 501974823b33..b87c7a5a5497 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2975,6 +2975,55 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_sibling - find the next sibling of a given cgroup + * @pos: the current cgroup + * + * This function returns the next sibling of @pos and should be called + * under RCU read lock. The only requirement is that @pos is accessible. + * The next sibling is guaranteed to be returned regardless of @pos's + * state. + */ +struct cgroup *cgroup_next_sibling(struct cgroup *pos) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* + * @pos could already have been removed. Once a cgroup is removed, + * its ->sibling.next is no longer updated when its next sibling + * changes. 
As CGRP_REMOVED is set on removal which is fully + * serialized, if we see it unasserted, it's guaranteed that the + * next sibling hasn't finished its grace period even if it's + * already removed, and thus safe to dereference from this RCU + * critical section. If ->sibling.next is inaccessible, + * cgroup_is_removed() is guaranteed to be visible as %true here. + */ + if (likely(!cgroup_is_removed(pos))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return next; + return NULL; + } + + /* + * Can't dereference the next pointer. Each cgroup is given a + * monotonically increasing unique serial number and always + * appended to the sibling list, so the next one can be found by + * walking the parent's children until we see a cgroup with higher + * serial number than @pos's. + * + * While this path can be slow, it's taken only when either the + * current cgroup is removed or iteration and removal race. + */ + list_for_each_entry_rcu(next, &pos->parent->children, sibling) + if (next->serial_nr > pos->serial_nr) + return next; + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_sibling); + /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) @@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0); struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); + /* + * Assign a monotonically increasing serial number. 
With the list + * appending below, it guarantees that sibling cgroups are always + * sorted in the ascending serial number order on the parent's + * ->children. + */ + cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor); + /* allocation complete, commit to creation */ list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); @@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * removed. This makes future css_tryget() and child creation * attempts fail thus maintaining the removal conditions verified * above. + * + * Note that CGRP_REMVOED clearing is depended upon by + * cgroup_next_sibling() to resume iteration after dropping RCU + * read lock. See cgroup_next_sibling() for details. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v1.2.3 From 75501a6d59e989e5c286716e5b3b66ace4660e83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: update iterators to use cgroup_next_sibling() This patch converts cgroup_for_each_child(), cgroup_next_descendant_pre/post() and thus cgroup_for_each_descendant_pre/post() to use cgroup_next_sibling() instead of manually dereferencing ->sibling.next. The only reason the iterators couldn't allow dropping RCU read lock while iteration is in progress was because they couldn't determine the next sibling safely once RCU read lock is dropped. Using cgroup_next_sibling() removes that problem and enables all iterators to allow dropping RCU read lock in the middle. Comments are updated accordingly. This makes the iterators easier to use and will simplify controllers. Note that @cgroup argument is renamed to @cgrp in cgroup_for_each_child() because it conflicts with "struct cgroup" used in the new macro body. Signed-off-by: Tejun Heo Acked-by: Serge E. 
Hallyn Reviewed-by: Michal Hocko --- kernel/cgroup.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b87c7a5a5497..fefc41c1a147 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3031,6 +3031,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_sibling); * * To be used by cgroup_for_each_descendant_pre(). Find the next * descendant to visit for pre-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup) @@ -3050,11 +3055,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, - sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return next; - pos = pos->parent; } @@ -3069,6 +3072,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * Return the rightmost descendant of @pos. If there's no descendant, * @pos is returned. This can be used during pre-order traversal to skip * subtree of @pos. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct rightmost descendant as long as @pos is + * accessible. */ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) { @@ -3108,6 +3116,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) * * To be used by cgroup_for_each_descendant_post(). 
Find the next * descendant to visit for post-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, struct cgroup *cgroup) @@ -3123,8 +3136,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return cgroup_leftmost_descendant(next); /* no sibling left, visit parent */ -- cgit v1.2.3 From 4eedb77a9cd8f2e68b31c8b9a20524a50727c16f Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Fri, 17 May 2013 10:51:33 +0200 Subject: locking: Fix copy/paste errors of "ARCH_INLINE_*_UNLOCK_BH" The Kconfig symbols ARCH_INLINE_READ_UNLOCK_IRQ, ARCH_INLINE_SPIN_UNLOCK_IRQ, and ARCH_INLINE_WRITE_UNLOCK_IRQ were added in v2.6.33, but have never actually been used. Ingo Molnar spotted that this is caused by three identical copy/paste errors. E.g., the Kconfig entry for INLINE_READ_UNLOCK_IRQ has an (optional) dependency on: ARCH_INLINE_READ_UNLOCK_BH where it apparently should depend on: ARCH_INLINE_READ_UNLOCK_IRQ instead. Likewise for the Kconfig entries for INLINE_SPIN_UNLOCK_IRQ and INLINE_WRITE_UNLOCK_IRQ. Fix these three errors. This never really caused any real problems as these symbols are set (or unset) in a group - but it's worth fixing it nevertheless.
Reported-by: Ingo Molnar Signed-off-by: Paul Bolle Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1368780693.1350.228.camel@x61.thuisdomein Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100eaa..d2b32ac27a39 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y -- cgit v1.2.3 From ab573844e3058eef2788803d373019f8bebead57 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 1 May 2013 17:25:44 +0200 Subject: perf: Fix hw breakpoints overflow period sampling The hw breakpoint pmu 'add' function is missing the period_left update needed for SW events. The perf HW breakpoint events use the SW events framework to process the overflow, so it needs to be properly initialized in the PMU 'add' method. Signed-off-by: Jiri Olsa Reviewed-by: Peter Zijlstra Cc: H. 
Peter Anvin Cc: Oleg Nesterov Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Vince Weaver Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1367421944-19082-5-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..e0dcced282e4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4961,7 +4961,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); * sign as trigger. */ -static u64 perf_swevent_set_period(struct perf_event *event) +u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1f..966a241e8616 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -612,6 +612,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags) if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(bp)) { + bp->hw.last_period = bp->hw.sample_period; + perf_swevent_set_period(bp); + } + return arch_install_hw_breakpoint(bp); } -- cgit v1.2.3 From 9e6302056f8029f438e853432a856b9f13de26a6 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Apr 2013 14:21:33 +0200 Subject: perf: Use hrtimers for event multiplexing The current scheme of using the timer tick was fine for per-thread events. However, it was causing bias issues in system-wide mode (including for uncore PMUs). Event groups would not get their fair share of runtime on the PMU. With tickless kernels, if a core is idle there is no timer tick, and thus no event rotation (multiplexing). 
However, there are events (especially uncore events) which do count even though cores are asleep. This patch changes the timer source for multiplexing. It introduces a per-PMU per-cpu hrtimer. The advantage is that even when a core goes idle, it will come back to service the hrtimer, thus multiplexing on system-wide events works much better. The per-PMU implementation (suggested by PeterZ) enables adjusting the multiplexing interval per PMU. The preferred interval is stashed into the struct pmu. If not set, it will be forced to the default interval value. In order to minimize the impact of the hrtimer, it is turned on and off on demand. When the PMU on a CPU is overcommitted, the hrtimer is activated. It is stopped when the PMU is not overcommitted. In order for this to work properly, we had to change the order of initialization in start_kernel() such that hrtimer_init() is run before perf_event_init(). The default interval in milliseconds is set to a timer tick just like with the old code. We will provide a sysctl to tune this in another patch.
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1364991694-5876-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 106 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e0dcced282e4..97bfac7e6f45 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -170,6 +170,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_rotate_context(struct perf_cpu_context *cpuctx); + int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -658,6 +660,98 @@ perf_cgroup_mark_enabled(struct perf_event *event, } #endif +/* + * set default to be dependent on timer tick just + * like original code + */ +#define PERF_CPU_HRTIMER (1000 / HZ) +/* + * function must be called with interrupts disbled + */ +static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +{ + struct perf_cpu_context *cpuctx; + enum hrtimer_restart ret = HRTIMER_NORESTART; + int rotations = 0; + + WARN_ON(!irqs_disabled()); + + cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); + + rotations = perf_rotate_context(cpuctx); + + /* + * arm timer if needed + */ + if (rotations) { + hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + ret = HRTIMER_RESTART; + } + + return ret; +} + +/* CPU is going down */ +void perf_cpu_hrtimer_cancel(int cpu) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (WARN_ON(cpu != smp_processor_id())) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if 
(pmu->task_ctx_nr == perf_sw_context) + continue; + + hrtimer_cancel(&cpuctx->hrtimer); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* no multiplexing needed for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + cpuctx->hrtimer_interval = + ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER); + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + hr->function = perf_cpu_hrtimer_handler; +} + +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* not for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + if (hrtimer_active(hr)) + return; + + if (!hrtimer_callback_running(hr)) + __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, + 0, HRTIMER_MODE_REL_PINNED, 0); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1506,6 +1600,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1552,6 +1647,8 @@ group_error: pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); + return -EAGAIN; } @@ -1807,8 +1904,10 @@ static int __perf_event_enable(void *info) * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != event) + if (leader != event) { group_sched_out(leader, cpuctx, ctx); + perf_cpu_hrtimer_restart(cpuctx); + } if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_EVENT_STATE_ERROR; @@ -2555,7 +2654,7 @@ static void rotate_ctx(struct perf_event_context *ctx) * because they're strictly cpu affine and rotate_start is called with IRQs * disabled, while rotate_context is called from IRQ context. 
*/ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) +static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0, remove = 1; @@ -2594,6 +2693,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) done: if (remove) list_del_init(&cpuctx->rotation_list); + + return rotate; } #ifdef CONFIG_NO_HZ_FULL @@ -2625,10 +2726,6 @@ void perf_event_task_tick(void) ctx = cpuctx->task_ctx; if (ctx) perf_adjust_freq_unthr_context(ctx, throttled); - - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); } } @@ -6001,7 +6098,9 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; + + __perf_cpu_hrtimer_init(cpuctx, cpu); + INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -7387,7 +7486,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; - default: break; } -- cgit v1.2.3 From 62b8563979273424d6ebe9201e34d1acc133ad4f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 3 Apr 2013 14:21:34 +0200 Subject: perf: Add sysfs entry to adjust multiplexing interval per PMU This patch adds /sys/devices/xxx/perf_event_mux_interval_ms to adjust the multiplexing interval per PMU. The unit is milliseconds. Value has to be >= 1. In the 4th version, we renamed the sysfs file to be more consistent with the other /proc/sys/kernel entries for perf_events. In the 5th version, we handle the reprogramming of the hrtimer using hrtimer_forward_now(). That way, we sync up to new timer value quickly (suggested by Jiri Olsa).
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1364991694-5876-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 97bfac7e6f45..53d1b300116a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -723,13 +723,21 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { struct hrtimer *hr = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; + int timer; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) return; - cpuctx->hrtimer_interval = - ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER); + /* + * check default is sane, if not set then force to + * default interval (1/tick) + */ + timer = pmu->hrtimer_interval_ms; + if (timer < 1) + timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); hr->function = perf_cpu_hrtimer_handler; @@ -6001,9 +6009,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static ssize_t +perf_event_mux_interval_ms_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); +} + +static ssize_t +perf_event_mux_interval_ms_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pmu *pmu = dev_get_drvdata(dev); + int timer, cpu, ret; + + ret = kstrtoint(buf, 0, &timer); + if (ret) + return ret; + + if (timer < 1) + return -EINVAL; + + /* same value, noting to do */ + if (timer == 
pmu->hrtimer_interval_ms) + return count; + + pmu->hrtimer_interval_ms = timer; + + /* update all cpuctx for this PMU */ + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + if (hrtimer_active(&cpuctx->hrtimer)) + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + } + + return count; +} + +#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) + static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, + __ATTR_RO(type), + __ATTR_RW(perf_event_mux_interval_ms), + __ATTR_NULL, }; static int pmu_bus_running; -- cgit v1.2.3 From 2b923c8f5de6722393e614b096d5040b6d4eaf98 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 21 May 2013 12:53:37 +0200 Subject: perf/x86: Check branch sampling priv level in generic code This patch moves commit 7cc23cd to the generic code: perf/x86/intel/lbr: Demand proper privileges for PERF_SAMPLE_BRANCH_KERNEL The check is now implemented in generic code instead of x86 specific code. That way we do not have to repeat the test in each arch supporting branch sampling. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20130521105337.GA2879@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 53d1b300116a..a0780b3a3d50 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6481,11 +6481,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; - /* kernel level capture: check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { @@ -6503,6 +6498,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, */ attr->branch_sample_type = mask; } + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_KERNEL) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { -- cgit v1.2.3 From 0c1061733aa0303e6536c0bc7f86d68f5eb55446 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 15 May 2013 11:04:10 +0200 Subject: rtmutex: Document rt_mutex_adjust_prio_chain() Parameters and usage of rt_mutex_adjust_prio_chain() are already documented in Documentation/rt-mutex-design.txt. However, since this function is called from several paths with different semantics (related to the arguments), it is handy to have a quick reference directly in the code. 
Signed-off-by: Juri Lelli Cc: Clark Williams Cc: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1368608650-7935-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 1e09308bf2a1..0dd6aec1cb6a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -145,6 +145,19 @@ int max_lock_depth = 1024; /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. + * + * @task: the task owning the mutex (owner) for which a chain walk is probably + * needed + * @deadlock_detect: do we have to carry out deadlock detection? + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @orig_waiter: rt_mutex_waiter struct for the task that has just donated + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter + * * Returns 0 or -EDEADLK. 
*/ static int rt_mutex_adjust_prio_chain(struct task_struct *task, -- cgit v1.2.3 From c7e99fc75de8882bc4104455ace366d9d3599a96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:28:02 +0200 Subject: clockevents: Define CS_NAME_LEN unconditionally Unbreak architectures which do not use clockevents, but require to build some of the core timekeeping infrastructure Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index be1690eaecff..bc906cad709b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,13 +6,13 @@ extern seqlock_t jiffies_lock; +#define CS_NAME_LEN 32 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 -#define CS_NAME_LEN 32 - DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; extern ktime_t tick_period; -- cgit v1.2.3 From 41261b6a832ea0e788627f6a8707854423f9ff49 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 24 May 2013 18:07:49 +0200 Subject: sched/autogroup: Fix race with task_groups list In autogroup_create(), a tg is allocated and added to the task_groups list. If CONFIG_RT_GROUP_SCHED is set, this tg is then modified while on the list, without locking. This can race with someone walking the list, like __enable_runtime() during CPU unplug, and result in a use-after-free bug. To fix this, move sched_online_group(), which adds the tg to the list, to the end of the autogroup_create() function after the modification. 
Signed-off-by: Gerald Schaefer Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369411669-46971-2-git-send-email-gerald.schaefer@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9e..4a073539c58e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; - sched_online_group(tg, &root_task_group); - kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); @@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) #endif tg->autogroup = ag; + sched_online_group(tg, &root_task_group); return ag; out_free: -- cgit v1.2.3 From c5405a495e88d93cf9b4f4cc91507c7f4afcb901 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Thu, 11 Apr 2013 21:04:59 +0800 Subject: sched: Remove redundant update_runtime notifier migration_call() will do all the things that update_runtime() does. So let's remove it. Furthermore, there is a potential risk that the current code will hit the BUG_ON at line 689 of rt.c when doing CPU hotplug while realtime threads are running, because runtime is enabled twice while rt_runtime may have already changed. 
Signed-off-by: Neil Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365685499-26515-1-git-send-email-zhangwm@marvell.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- kernel/sched/rt.c | 40 ---------------------------------------- kernel/sched/sched.h | 1 - 3 files changed, 44 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfa7e77e0b50..79e48e6a9385 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6285,9 +6285,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - init_hrtick(); /* Move init over to a non-isolated CPU */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7aced2e3b085..8853ab17b750 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -699,15 +699,6 @@ balanced: } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static void __enable_runtime(struct rq *rq) { rt_rq_iter_t iter; @@ -732,37 +723,6 @@ static void __enable_runtime(struct rq *rq) } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - static int balance_runtime(struct 
rt_rq *rt_rq) { int more = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1f6256c1224..c806c61a1261 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1041,7 +1041,6 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -- cgit v1.2.3 From 77bd39702f0b3840cea17681409270b16a3b93c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:50:58 +0200 Subject: sched: Update rq clock before migrating tasks out of dying CPU Because the sched_class::put_prev_task() callback of rt and fair classes are referring to the rq clock to update their runtime statistics. There is a missing rq clock update from the CPU hotplug notifier's entry point of the scheduler. 
Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79e48e6a9385..7bf0418dc60f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4378,6 +4378,13 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* + * put_prev_task() and pick_next_task() sched + * class method both need to have an up-to-date + * value of rq->clock[_task] + */ + update_rq_clock(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only -- cgit v1.2.3 From 71b1da46ff70309a2ec12ce943aa0d192d2c8f0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:50:59 +0200 Subject: sched: Update rq clock before setting fair group shares Because we may update the execution time in sched_group_set_shares()->update_cfs_shares()->reweight_entity()->update_curr() before reweighting the entity while setting the group shares and this requires an uptodate version of the runqueue clock. 
Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f62b16dfba63..f76ca21711bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6107,6 +6107,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); + + /* Possible calls to update_curr() need rq clock */ + update_rq_clock(rq); for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3 From 1ad4ec0dc740c4183acd6d6e367ca52b28e4fa94 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:00 +0200 Subject: sched: Update rq clock before calling check_preempt_curr() check_preempt_curr() of fair class needs an uptodate sched clock value to update runtime stats of the current task of the target's rq. When a task is woken up, activate_task() is usually called right before ttwu_do_wakeup() unless the task is still in the runqueue. In the latter case we need to update the rq clock explicitly because activate_task() isn't here to do the job for us. 
Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bf0418dc60f..46d00172ae4a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1365,6 +1365,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p); if (p->on_rq) { + /* check_preempt_curr() may use rq clock */ + update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } -- cgit v1.2.3 From 1a55af2e45cce0ff13bc33c8ee99da84e188b615 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:01 +0200 Subject: sched: Update rq clock earlier in unthrottle_cfs_rq In this function we are making use of rq->clock right before the update of the rq clock, let's just call update_rq_clock() just before that to avoid using a stale rq clock value. 
Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f76ca21711bb..1c8762a5370c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2319,12 +2319,14 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; cfs_rq->throttled = 0; + + update_rq_clock(rq); + raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - update_rq_clock(rq); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); -- cgit v1.2.3 From 78becc27097585c6aec7043834cadde950ae79f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 01:51:02 +0200 Subject: sched: Use an accessor to read the rq clock Read the runqueue clock through an accessor. This prepares for adding a debugging infrastructure to detect missing or redundant calls to update_rq_clock() between a scheduler's entry and exit point. 
Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Steven Rostedt Cc: Paul Turner Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1365724262-20142-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 44 ++++++++++++++++++++++---------------------- kernel/sched/rt.c | 8 ++++---- kernel/sched/sched.h | 10 ++++++++++ kernel/sched/stats.h | 8 ++++---- kernel/sched/stop_task.c | 8 ++++---- 6 files changed, 47 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46d00172ae4a..36f85be2932b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -667,7 +667,7 @@ void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); - while ((s64)(rq->clock - rq->age_stamp) > period) { + while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { /* * Inline assembly required to prevent the compiler * optimising this loop into a divmod call. 
@@ -1328,7 +1328,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) p->sched_class->task_woken(rq, p); if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; + u64 delta = rq_clock(rq) - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; if (delta > max) @@ -2106,7 +2106,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock_task - p->se.exec_start; + ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) ns = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c8762a5370c..3ee1c2e4ae60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -704,7 +704,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; + u64 now = rq_clock_task(rq_of(cfs_rq)); unsigned long delta_exec; if (unlikely(!curr)) @@ -736,7 +736,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } /* @@ -756,14 +756,14 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - 
se->statistics.wait_start); } #endif schedstat_set(se->statistics.wait_start, 0); @@ -789,7 +789,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock_task; + se->exec_start = rq_clock_task(rq_of(cfs_rq)); } /************************************************** @@ -1515,7 +1515,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } @@ -1530,7 +1530,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, * accumulated while sleeping. */ if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); if (se->avg.decay_count) { /* * In a wake-up migration we have to approximate the @@ -1625,7 +1625,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) tsk = task_of(se); if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; @@ -1642,7 +1642,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } } if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; if ((s64)delta < 0) delta = 0; @@ -1823,9 +1823,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); if 
(tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); } #endif } @@ -2100,7 +2100,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) if (unlikely(cfs_rq->throttle_count)) return cfs_rq->throttled_clock_task; - return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; + return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } /* returns 0 on failure to allocate runtime */ @@ -2159,7 +2159,7 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) return; if (cfs_rq->runtime_remaining < 0) @@ -2248,7 +2248,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ - cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } #endif @@ -2263,7 +2263,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - cfs_rq->throttled_clock_task = rq->clock_task; + cfs_rq->throttled_clock_task = rq_clock_task(rq); cfs_rq->throttle_count++; return 0; @@ -2302,7 +2302,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq->clock; + cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -2323,7 +2323,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; 
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); @@ -2726,7 +2726,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) #else /* CONFIG_CFS_BANDWIDTH */ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { - return rq_of(cfs_rq)->clock_task; + return rq_clock_task(rq_of(cfs_rq)); } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, @@ -3966,7 +3966,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4322,7 +4322,7 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq->clock - age_stamp); + total = sched_avg_period() + (rq_clock(rq) - age_stamp); if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ @@ -5261,7 +5261,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - this_rq->idle_stamp = this_rq->clock; + this_rq->idle_stamp = rq_clock(this_rq); if (this_rq->avg_idle < sysctl_sched_migration_cost) return; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8853ab17b750..8d85f9ac4262 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -886,7 +886,7 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -896,7 +896,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = 
rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1345,7 +1345,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); return p; } @@ -1997,7 +1997,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c806c61a1261..74ff659e964f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -548,6 +548,16 @@ DECLARE_PER_CPU(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +static inline u64 rq_clock(struct rq *rq) +{ + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + return rq->clock_task; +} + #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..17d7065c3872 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) */ static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -100,7 +100,7 @@ static inline void sched_info_queued(struct 
task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = task_rq(t)->clock; + t->sched_info.last_queued = rq_clock(task_rq(t)); } /* @@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = task_rq(t)->clock - + unsigned long long delta = rq_clock(task_rq(t)) - t->sched_info.last_arrival; rq_sched_info_depart(task_rq(t), delta); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84a..e08fbeeb54b9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) struct task_struct *stop = rq->stop; if (stop && stop->on_rq) { - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); return stop; } @@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) struct task_struct *curr = rq->curr; u64 delta_exec; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); } @@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); } static void switched_to_stop(struct rq *rq, struct task_struct *p) -- cgit v1.2.3 From 1eaff67266b6b6c97bbd33cf2c20577822836413 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:48:46 +0200 Subject: clocksource: Implement clocksource_select_fallback() for 
CONFIG_ARCH_USES_GETTIMEOFFSET=y commit 7eaeb34305 (clocksource: Provide unbind interface in sysfs) implemented clocksource_select_fallback() which is not defined for CONFIG_ARCH_USES_GETTIMEOFFSET=y. Add an empty inline function for that. Reported-by: Ingo Molnar Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6d05b00410cc..e713ef7d19a7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -632,6 +632,7 @@ static void clocksource_select_fallback(void) #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif -- cgit v1.2.3 From a6572f84c5b135d9b6df279ed3c8de028bd1edd9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 17 May 2013 10:31:04 +0800 Subject: watchdog: Disallow setting watchdog_thresh to -1 In old kernels, it was allowed to set softlockup_thresh to -1 or 0 to disable softlockup detection. However, watchdog_thresh only uses 0 to disable detection, and setting it to -1 just froze my box, leaving me nothing to do but reboot. 
Signed-off-by: Li Zefan Acked-by: Don Zickus Link: http://lkml.kernel.org/r/51959668.9040106@huawei.com Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..b0a1f99907f3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -120,7 +120,6 @@ extern int blk_iopoll_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; -static int neg_one = -1; #endif static int zero; @@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dowatchdog, - .extra1 = &neg_one, + .extra1 = &zero, .extra2 = &sixty, }, { -- cgit v1.2.3 From 84f9f3a15611536537d59060818a2354d5039ff3 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 2 May 2013 15:34:33 +0200 Subject: sched: Use swap() macro in scale_stime() Simple cleanup. Reported-by: Peter Zijlstra Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1367501673-6563-1-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..94691bcd7364 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) for (;;) { /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } + if (stime > rtime) + swap(rtime, stime); /* Make sure 'total' fits in 32 bits */ if (total >> 32) -- cgit v1.2.3 From cfeaa93f8a13ae9117ae20933a38a406de80849e Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 6 May 2013 14:30:17 +0000 Subject: genirq: Generic chip: Remove the local cur_regs() function Since we already have an 
irq_data_get_chip_type() function which returns a pointer to irq_chip_type, use that instead of cur_regs(). Signed-off-by: Gerlando Falauto Cc: Andrew Lunn Cc: Joey Oravec Cc: Lennert Buytenhek Cc: Russell King - ARM Linux Cc: Jason Gunthorpe Cc: Holger Brunck Cc: Ezequiel Garcia Acked-by: Grant Likely Cc: Sebastian Hesselbarth Cc: Jason Cooper Cc: Arnd Bergmann Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Ben Dooks Cc: Gregory Clement Cc: Simon Guinot Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Petazzoni Cc: Jean-Francois Moine Cc: Nicolas Pitre Cc: Rob Landley Cc: Maxime Ripard Link: http://lkml.kernel.org/r/20130506142539.010164766@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f668..0e6ba789056c 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -16,11 +16,6 @@ static LIST_HEAD(gc_list); static DEFINE_RAW_SPINLOCK(gc_lock); -static inline struct irq_chip_regs *cur_regs(struct irq_data *d) -{ - return &container_of(d->chip, struct irq_chip_type, chip)->regs; -} - /** * irq_gc_noop - NOOP function * @d: irq_data @@ -39,10 +34,11 @@ void irq_gc_noop(struct irq_data *d) void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); + irq_reg_writel(mask, gc->reg_base + ct->regs.disable); gc->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -57,11 +53,12 @@ void irq_gc_mask_disable_reg(struct irq_data *d) void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - 
gc->irq_base); irq_gc_lock(gc); gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -75,11 +72,12 @@ void irq_gc_mask_set_bit(struct irq_data *d) void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -93,10 +91,11 @@ void irq_gc_mask_clr_bit(struct irq_data *d) void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); + irq_reg_writel(mask, gc->reg_base + ct->regs.enable); gc->mask_cache |= mask; irq_gc_unlock(gc); } @@ -108,10 +107,11 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -122,10 +122,11 @@ void irq_gc_ack_set_bit(struct irq_data *d) void irq_gc_ack_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = ~(1 << (d->irq - gc->irq_base)); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -136,11 +137,12 @@ void 
irq_gc_ack_clr_bit(struct irq_data *d) void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.mask); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -151,10 +153,11 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); irq_gc_unlock(gc); } -- cgit v1.2.3 From 899f0e66fff36ebb6dd6a83af9aa631f6cb7e0dc Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 6 May 2013 14:30:19 +0000 Subject: genirq: Generic chip: Add support for per chip type mask cache Today the same interrupt mask cache (stored within struct irq_chip_generic) is shared between all the irq_chip_type instances. As there are instances where each irq_chip_type uses a distinct mask register (as it is the case for Orion SoCs), sharing a single mask cache may be incorrect. So add a distinct pointer for each irq_chip_type, which for now points to the original mask register within irq_chip_generic. So no functional changes here. 
[ tglx: Minor cosmetic tweaks ] Reported-by: Joey Oravec Signed-off-by: Simon Guinot Signed-off-by: Holger Brunck Signed-off-by: Gerlando Falauto Cc: Andrew Lunn Cc: Lennert Buytenhek Cc: Russell King - ARM Linux Cc: Jason Gunthorpe Cc: Holger Brunck Cc: Ezequiel Garcia Acked-by: Grant Likely Cc: Sebastian Hesselbarth Cc: Jason Cooper Cc: Arnd Bergmann Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Ben Dooks Cc: Gregory Clement Cc: Simon Guinot Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Petazzoni Cc: Jean-Francois Moine Cc: Nicolas Pitre Cc: Rob Landley Cc: Maxime Ripard Link: http://lkml.kernel.org/r/20130506142539.082226607@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 0e6ba789056c..113d9ebfe0aa 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.disable); - gc->mask_cache &= ~mask; + *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -57,8 +57,8 @@ void irq_gc_mask_set_bit(struct irq_data *d) u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); + *ct->mask_cache |= mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -76,8 +76,8 @@ void irq_gc_mask_clr_bit(struct irq_data *d) u32 mask = 1 << (d->irq - gc->irq_base); irq_gc_lock(gc); - gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask); + *ct->mask_cache &= ~mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } @@ -96,7 +96,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.enable); 
- gc->mask_cache |= mask; + *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -250,6 +250,10 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (flags & IRQ_GC_INIT_MASK_CACHE) gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + /* Initialize mask cache pointer */ + for (i = 0; i < gc->num_ct; i++) + ct[i].mask_cache = &gc->mask_cache; + for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) continue; -- cgit v1.2.3 From af80b0fed67261dcba2ce2406db1d553d07cbe75 Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 6 May 2013 14:30:21 +0000 Subject: genirq: Generic chip: Handle separate mask registers There are cases where all irq_chip_type instances have separate mask registers, making a shared mask register cache unsuitable for the purpose. Introduce a new flag IRQ_GC_MASK_CACHE_PER_TYPE. If set, point the per chip mask pointer to the per chip private mask cache instead. [ tglx: Simplified code, renamed flag and massaged changelog ] Signed-off-by: Gerlando Falauto Cc: Andrew Lunn Cc: Joey Oravec Cc: Lennert Buytenhek Cc: Russell King - ARM Linux Cc: Jason Gunthorpe Cc: Holger Brunck Cc: Ezequiel Garcia Acked-by: Grant Likely Cc: Sebastian Hesselbarth Cc: Jason Cooper Cc: Arnd Bergmann Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Ben Dooks Cc: Gregory Clement Cc: Simon Guinot Cc: linux-arm-kernel@lists.infradead.org Cc: Thomas Petazzoni Cc: Jean-Francois Moine Cc: Nicolas Pitre Cc: Rob Landley Cc: Maxime Ripard Link: http://lkml.kernel.org/r/20130506142539.152569748@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 113d9ebfe0aa..da2a94191fc5 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -241,18 +241,21 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, { struct 
irq_chip_type *ct = gc->chip_types; unsigned int i; + u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; raw_spin_lock(&gc_lock); list_add_tail(&gc->list, &gc_list); raw_spin_unlock(&gc_lock); - /* Init mask cache ? */ - if (flags & IRQ_GC_INIT_MASK_CACHE) - gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); - - /* Initialize mask cache pointer */ - for (i = 0; i < gc->num_ct; i++) - ct[i].mask_cache = &gc->mask_cache; + for (i = 0; i < gc->num_ct; i++) { + if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { + mskptr = &ct[i].mask_cache_priv; + mskreg = ct[i].regs.mask; + } + ct[i].mask_cache = mskptr; + if (flags & IRQ_GC_INIT_MASK_CACHE) + *mskptr = irq_reg_readl(gc->reg_base + mskreg); + } for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) -- cgit v1.2.3 From 966dc736b819999cd2d3a6408d47d33b579f7d56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:22 +0000 Subject: genirq: Generic chip: Cache per irq bit mask Cache the per irq bit mask instead of recalculating it over and over. 
Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.227119865@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index da2a94191fc5..957155cebbac 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -35,7 +35,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.disable); @@ -54,7 +54,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); *ct->mask_cache |= mask; @@ -73,7 +73,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); *ct->mask_cache &= ~mask; @@ -92,7 +92,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); 
irq_reg_writel(mask, gc->reg_base + ct->regs.enable); @@ -108,7 +108,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.ack); @@ -123,7 +123,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = ~(1 << (d->irq - gc->irq_base)); + u32 mask = ~d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.ack); @@ -138,7 +138,7 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.mask); @@ -154,7 +154,7 @@ void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); struct irq_chip_type *ct = irq_data_get_chip_type(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; irq_gc_lock(gc); irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); @@ -172,7 +172,7 @@ void irq_gc_eoi(struct irq_data *d) int irq_gc_set_wake(struct irq_data *d, unsigned int on) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; if (!(mask & gc->wake_enabled)) return -EINVAL; @@ -264,6 +264,11 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (flags & IRQ_GC_INIT_NESTED_LOCK) irq_set_lockdep_class(i, &irq_nested_lock_class); + if (!(flags & IRQ_GC_NO_MASK)) { + struct irq_data *d = irq_get_irq_data(i); + + d->mask = 1 << (i - gc->irq_base); + } irq_set_chip_and_handler(i, &ct->chip, ct->handler); irq_set_chip_data(i, gc); 
irq_modify_status(i, clr, set); -- cgit v1.2.3 From d0051816e619f8f082582bec07ffa51bdb4f2104 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:24 +0000 Subject: genirq: irqchip: Add a mask calculation function Some chips have weird bit mask access patterns instead of the linear one you expect. Allow them to calculate the cached mask themselves. Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.302898834@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 957155cebbac..5068fe3ae1af 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -240,6 +240,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int set) { struct irq_chip_type *ct = gc->chip_types; + struct irq_chip *chip = &ct->chip; unsigned int i; u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; @@ -267,9 +268,12 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); - d->mask = 1 << (i - gc->irq_base); + if (chip->irq_calc_mask) + chip->irq_calc_mask(d); + else + d->mask = 1 << (i - gc->irq_base); } - irq_set_chip_and_handler(i, &ct->chip, ct->handler); + irq_set_chip_and_handler(i, chip, ct->handler); irq_set_chip_data(i, gc); irq_modify_status(i, clr, set); } -- cgit v1.2.3 From 3528d82b684680b72fa31881c8c572c5a98b51de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner 
Date: Mon, 6 May 2013 14:30:25 +0000 Subject: genirq: Generic chip: Split out code into separate functions Preparatory patch for linear interrupt domains. Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.377017672@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 50 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 5068fe3ae1af..3deb3333d53e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -186,6 +186,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; +} + /** * irq_alloc_generic_chip - Allocate a generic chip and initialize it * @name: Name of the irq chip @@ -206,17 +219,31 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, gc = kzalloc(sz, GFP_KERNEL); if (gc) { - raw_spin_lock_init(&gc->lock); - gc->num_ct = num_ct; - gc->irq_base = irq_base; - gc->reg_base = reg_base; - gc->chip_types->chip.name = name; - gc->chip_types->handler = handler; + irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, + handler); } return gc; } 
EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); +static void +irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) +{ + struct irq_chip_type *ct = gc->chip_types; + u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; + int i; + + for (i = 0; i < gc->num_ct; i++) { + if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { + mskptr = &ct[i].mask_cache_priv; + mskreg = ct[i].regs.mask; + } + ct[i].mask_cache = mskptr; + if (flags & IRQ_GC_INIT_MASK_CACHE) + *mskptr = irq_reg_readl(gc->reg_base + mskreg); + } +} + /* * Separate lockdep class for interrupt chip which can nest irq_desc * lock. @@ -242,21 +269,12 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, struct irq_chip_type *ct = gc->chip_types; struct irq_chip *chip = &ct->chip; unsigned int i; - u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; raw_spin_lock(&gc_lock); list_add_tail(&gc->list, &gc_list); raw_spin_unlock(&gc_lock); - for (i = 0; i < gc->num_ct; i++) { - if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { - mskptr = &ct[i].mask_cache_priv; - mskreg = ct[i].regs.mask; - } - ct[i].mask_cache = mskptr; - if (flags & IRQ_GC_INIT_MASK_CACHE) - *mskptr = irq_reg_readl(gc->reg_base + mskreg); - } + irq_gc_init_mask_cache(gc, flags); for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) -- cgit v1.2.3 From 088f40b7b027dad6519712ff224a5798dd62a204 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 May 2013 14:30:27 +0000 Subject: genirq: Generic chip: Add linear irq domain support Provide infrastructure for irq chip implementations which work on linear irq domains. - Interface to allocate multiple generic chips which are associated to the irq domain. - Interface to get the generic chip pointer for a particular hardware interrupt in the domain. - irq domain mapping function to install the chip for a particular interrupt. Note: This lacks a removal function for now. 
[ Sebastian Hesselbarth: Mask cache and pointer math fixups ] Signed-off-by: Thomas Gleixner Cc: Thomas Petazzoni Cc: Andrew Lunn Cc: Russell King - ARM Linux Cc: Jason Cooper Cc: Arnd Bergmann Cc: Jean-Francois Moine Cc: devicetree-discuss@lists.ozlabs.org Cc: Rob Herring Cc: Jason Gunthorpe Cc: Gregory Clement Cc: Gerlando Falauto Cc: Rob Landley Acked-by: Grant Likely Cc: Maxime Ripard Cc: Ezequiel Garcia Cc: linux-arm-kernel@lists.infradead.org Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20130506142539.450634298@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 187 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/irq/irqdomain.c | 6 -- 2 files changed, 181 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 3deb3333d53e..8743d62fded7 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -244,12 +245,156 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } } +/** + * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * @d: irq domain for which to allocate chips + * @irqs_per_chip: Number of interrupts each chip handles + * @num_ct: Number of irq_chip_type instances associated with this + * @name: Name of the irq chip + * @handler: Default flow handler associated with these chips + * @clr: IRQ_* bits to clear in the mapping function + * @set: IRQ_* bits to set in the mapping function + */ +int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) +{ + struct irq_domain_chip_generic *dgc; + struct irq_chip_generic *gc; + int numchips, sz, i; + unsigned long flags; + void *tmp; + + if (d->gc) + return -EBUSY; + + if (d->revmap_type != 
IRQ_DOMAIN_MAP_LINEAR) + return -EINVAL; + + numchips = d->revmap_data.linear.size / irqs_per_chip; + if (!numchips) + return -EINVAL; + + /* Allocate a pointer, generic chip and chiptypes for each chip */ + sz = sizeof(*dgc) + numchips * sizeof(gc); + sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + + tmp = dgc = kzalloc(sz, GFP_KERNEL); + if (!dgc) + return -ENOMEM; + dgc->irqs_per_chip = irqs_per_chip; + dgc->num_chips = numchips; + dgc->irq_flags_to_set = set; + dgc->irq_flags_to_clear = clr; + dgc->gc_flags = gcflags; + d->gc = dgc; + + /* Calc pointer to the first generic chip */ + tmp += sizeof(*dgc) + numchips * sizeof(gc); + for (i = 0; i < numchips; i++) { + /* Store the pointer to the generic chip */ + dgc->gc[i] = gc = tmp; + irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, + NULL, handler); + gc->domain = d; + raw_spin_lock_irqsave(&gc_lock, flags); + list_add_tail(&gc->list, &gc_list); + raw_spin_unlock_irqrestore(&gc_lock, flags); + /* Calc pointer to the next generic chip */ + tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + } + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); + +/** + * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq + * @d: irq domain pointer + * @hw_irq: Hardware interrupt number + */ +struct irq_chip_generic * +irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return NULL; + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return NULL; + return dgc->gc[idx]; +} +EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); + /* * Separate lockdep class for interrupt chip which can nest irq_desc * lock. 
*/ static struct lock_class_key irq_nested_lock_class; +/** + * irq_map_generic_chip - Map a generic chip for an irq domain + */ +static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + struct irq_data *data = irq_get_irq_data(virq); + struct irq_domain_chip_generic *dgc = d->gc; + struct irq_chip_generic *gc; + struct irq_chip_type *ct; + struct irq_chip *chip; + unsigned long flags; + int idx; + + if (!d->gc) + return -ENODEV; + + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return -EINVAL; + gc = dgc->gc[idx]; + + idx = hw_irq % dgc->irqs_per_chip; + + if (test_bit(idx, &gc->installed)) + return -EBUSY; + + ct = gc->chip_types; + chip = &ct->chip; + + /* We only init the cache for the first mapping of a generic chip */ + if (!gc->installed) { + raw_spin_lock_irqsave(&gc->lock, flags); + irq_gc_init_mask_cache(gc, dgc->gc_flags); + raw_spin_unlock_irqrestore(&gc->lock, flags); + } + + /* Mark the interrupt as installed */ + set_bit(idx, &gc->installed); + + if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(virq, &irq_nested_lock_class); + + if (chip->irq_calc_mask) + chip->irq_calc_mask(data); + else + data->mask = 1 << idx; + + irq_set_chip_and_handler(virq, chip, ct->handler); + irq_set_chip_data(virq, gc); + irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); + return 0; +} + +struct irq_domain_ops irq_generic_chip_ops = { + .map = irq_map_generic_chip, + .xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_generic_chip_ops); + /** * irq_setup_generic_chip - Setup a range of interrupts with a generic chip * @gc: Generic irq chip holding all data @@ -354,6 +499,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, } EXPORT_SYMBOL_GPL(irq_remove_generic_chip); +static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) +{ + unsigned int virq; + + if (!gc->domain) + return irq_get_irq_data(gc->irq_base); + + /* 
+ * We don't know which of the irqs has been actually + * installed. Use the first one. + */ + if (!gc->installed) + return NULL; + + virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed)); + return virq ? irq_get_irq_data(virq) : NULL; +} + #ifdef CONFIG_PM static int irq_gc_suspend(void) { @@ -362,8 +525,12 @@ static int irq_gc_suspend(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_suspend) - ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_suspend) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_suspend(data); + } } return 0; } @@ -375,8 +542,12 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_resume) - ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_resume) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_resume(data); + } } } #else @@ -391,8 +562,12 @@ static void irq_gc_shutdown(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_pm_shutdown) - ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_pm_shutdown) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_pm_shutdown(data); + } } } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0c..1db9e70f5488 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -16,12 +16,6 @@ #include #include -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. - * ie. 
legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ - static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -- cgit v1.2.3 From e8bd834f73714378ef110a64287db1b77033c8da Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 29 May 2013 03:10:52 +0100 Subject: genirq: irqchip: Add mask to block out invalid irqs Some controllers have irqs that aren't wired up and must never be used. For the generic chip attached to an irq_domain this provides a mask that can be used to block out particular irqs so that they never get mapped. Signed-off-by: Grant Likely Link: http://lkml.kernel.org/r/1369793454-19197-2-git-send-email-grant.likely@linaro.org Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 8743d62fded7..95575d8d5392 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -359,6 +359,9 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, idx = hw_irq % dgc->irqs_per_chip; + if (test_bit(idx, &gc->unused)) + return -ENOTSUPP; + if (test_bit(idx, &gc->installed)) return -EBUSY; -- cgit v1.2.3 From d671a605580d2caafc77f1a25bcf8435795df6fe Mon Sep 17 00:00:00 2001 From: Andreas Fenkart Date: Fri, 10 May 2013 12:21:30 +0200 Subject: genirq: Add kerneldoc for irq_disable. Document the lazy disable functionality. 
comment based on changelog of d209a699a0b975ad Signed-off-by: Andreas Fenkart Cc: balbi@ti.com Link: http://lkml.kernel.org/r/1368181290-1583-1-git-send-email-andreas.fenkart@streamunlimited.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cbd97ce0b000..a3bb14fbe5c6 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +/** + * irq_disable - Mark interrupt disabled + * @desc: irq descriptor which should be disabled + * + * If the chip does not implement the irq_disable callback, we + * use a lazy disable approach. That means we mark the interrupt + * disabled, but leave the hardware unmasked. That's an + * optimization because we avoid the hardware access for the + * common case where no interrupt happens after we marked it + * disabled. If an interrupt happens, then the interrupt flow + * handler masks the line at the hardware level and marks it + * pending. + */ void irq_disable(struct irq_desc *desc) { irq_state_set_disabled(desc); -- cgit v1.2.3 From 6cffe00f7d4e24679eae6b7aae4caaf915288256 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Wed, 15 May 2013 14:38:11 -0700 Subject: alarmtimer: Add functions for timerfd support Add functions needed for hooking up alarmtimer to timerfd: * alarm_restart: Similar to hrtimer_restart, restart an alarmtimer after the expires time has already been updated (as with alarm_forward). * alarm_forward_now: Similar to hrtimer_forward_now, move the expires time forward to an interval from the current time of the associated clock. * alarm_start_relative: Start an alarmtimer with an expires time relative to the current time of the associated clock. * alarm_expires_remaining: Similar to hrtimer_expires_remaining, return the amount of time remaining until alarm expiry. 
Signed-off-by: Todd Poynor Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b12949..3e5cba274475 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,12 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} + #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback @@ -305,7 +311,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, } /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ @@ -324,6 +330,31 @@ int alarm_start(struct alarm *alarm, ktime_t start) return ret; } +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} + /** * alarm_try_to_cancel - Tries to cancel an alarm timer * @alarm: ptr to alarm to be canceled @@ -394,6 +425,12 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) return overrun; } +u64 alarm_forward_now(struct alarm *alarm, ktime_t 
interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} -- cgit v1.2.3 From 5c83545f24ab3dd67e0ae0e2b795fea750f08c34 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Tue, 21 May 2013 22:32:14 -0700 Subject: power: Add option to log time spent in suspend Below is a patch from android kernel that maintains a histogram of suspend times. Please review and provide feedback. Statistics on the time spent in suspend are kept in /sys/kernel/debug/sleep_time. Cc: Android Kernel Team Cc: Colin Cross Cc: Todd Poynor Cc: San Mehat Cc: Benoit Goby Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Colin Cross Signed-off-by: Todd Poynor [zoran.markovic@linaro.org: Re-formatted suspend time table to better fit expected values. Moved accounting of suspend time into timekeeping core. Removed CONFIG_SUSPEND_TIME flag and made the feature conditional on CONFIG_DEBUG_FS. Changed the file name to sleep_time to better fit terminology in timekeeping core. Changed seq_printf to seq_puts. 
Tweaked commit message] Signed-off-by: Zoran Markovic Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/timekeeping.c | 2 ++ kernel/time/timekeeping_debug.c | 72 ++++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping_internal.h | 14 ++++++++ 4 files changed, 89 insertions(+) create mode 100644 kernel/time/timekeeping_debug.c create mode 100644 kernel/time/timekeeping_internal.h (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab504..d52ac8bf0006 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 933efa4071c3..838fc0777b68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,7 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -851,6 +852,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000000..802433a4f5eb --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000000..13323ea08ffa --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H 
+#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3 From 0de358f1c2642710d41190b73fbc295e675c4ab8 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 30 May 2013 14:34:20 +0530 Subject: sched/fair: Remove unused variable from expire_cfs_rq_runtime() Commit 78becc2709 ("sched: Use an accessor to read the rq clock") introduces rq_clock(), which obsoletes the use of the "rq" variable in expire_cfs_rq_runtime() and triggers this build warning: kernel/sched/fair.c: In function 'expire_cfs_rq_runtime': kernel/sched/fair.c:2159:13: warning: unused variable 'rq' [-Wunused-variable] Signed-off-by: Kamalesh Babulal Acked-by: Frederic Weisbecker Acked-by: Paul Turner Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1369904660-14169-1-git-send-email-kamalesh@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ee1c2e4ae60..143dcdbc47af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2156,7 +2156,6 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) -- cgit v1.2.3 From cd38ca854de15b26eb91009137cbe157d8a8e773 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 3 Jun 2013 18:20:29 +0000 Subject: PM / Hibernate: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. 
Commit 69f1d475cc did this for a similar printk in this file, but I must have missed this one. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807d..7872a35eafe7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, region->end_pfn = end_pfn; list_add_tail(&region->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } /* -- cgit v1.2.3 From 40b313608ad4ea655addd2ec6cdd106477ae8e15 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 21 May 2013 13:49:35 +1000 Subject: Finally eradicate CONFIG_HOTPLUG Ever since commit 45f035ab9b8f ("CONFIG_HOTPLUG should be always on"), it has been basically impossible to build a kernel with CONFIG_HOTPLUG turned off. Remove all the remaining references to it. Cc: Russell King Cc: Doug Thompson Cc: Bjorn Helgaas Cc: Steven Whitehouse Cc: Arnd Bergmann Cc: Pavel Machek Cc: "Rafael J.
Wysocki" Cc: Andrew Morton Signed-off-by: Stephen Rothwell Acked-by: Mauro Carvalho Chehab Acked-by: Hans Verkuil Signed-off-by: Greg Kroah-Hartman --- kernel/power/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180b..9c39de095ba9 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP - select HOTPLUG select HOTPLUG_CPU config PM_AUTOSLEEP -- cgit v1.2.3 From f12dc020149fad7087e119e54cffea668272bf7d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:13:02 -0700 Subject: cgroup: mark "tasks" cgroup file as insane Some resources controlled by cgroup aren't per-task and cgroup core allowing threads of a single thread_group to be in different cgroups forced memcg to explicitly find the group leader and use it. This is gonna be nasty when transitioning to unified hierarchy and in general we don't want and won't support granularity finer than processes. Mark "tasks" with CFTYPE_INSANE.
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org Cc: Vivek Goyal --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fefc41c1a147..1e0f445b5b88 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4037,6 +4037,7 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, static struct cftype files[] = { { .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, .release = cgroup_pidlist_release, -- cgit v1.2.3 From cc5943a7816ba6c00639837a62131386619548dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:13:55 -0700 Subject: cgroup: mark "notify_on_release" and "release_agent" cgroup files insane The empty cgroup notification mechanism currently implemented in cgroup is tragically outdated. Forking and execing userland process stopped being a viable notification mechanism more than a decade ago. We're gonna have a saner mechanism. Let's make it clear that this abomination is going away. Mark "notify_on_release" and "release_agent" with CFTYPE_INSANE. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e0f445b5b88..b3bb8a393642 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4052,6 +4052,7 @@ static struct cftype files[] = { }, { .name = "notify_on_release", + .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, @@ -4073,7 +4074,7 @@ static struct cftype files[] = { }, { .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, -- cgit v1.2.3 From d5c56ced775f6bdc32b689b01c9c4f9b66e18610 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jun 2013 19:14:34 -0700 Subject: cgroup: clean up the cftype array for the base cgroup files * Rename it from files[] (really?) to cgroup_base_files[]. * Drop CGROUP_FILE_GENERIC_PREFIX which was defined as "cgroup." and used inconsistently. Just use "cgroup." directly. * Collect insane files at the end. Note that only the insane ones are missing "cgroup." prefix. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b3bb8a393642..bc53d5014b28 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4029,35 +4029,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, return 0; } -/* - * for the common functions, 'private' gives the type of file - */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." 
-static struct cftype files[] = { +static struct cftype cgroup_base_files[] = { { - .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, - .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, - .mode = S_IRUGO | S_IWUSR, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .name = "cgroup.procs", .open = cgroup_procs_open, .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "notify_on_release", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .name = "cgroup.event_control", .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, @@ -4072,6 +4053,26 @@ static struct cftype files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_sane_behavior_show, }, + + /* + * Historical crazy stuff. These don't have "cgroup." prefix and + * don't exist if sane_behavior. If you're depending on these, be + * prepared to be burned. 
+ */ + { + .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ + .open = cgroup_tasks_open, + .write_u64 = cgroup_tasks_write, + .release = cgroup_pidlist_release, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "notify_on_release", + .flags = CFTYPE_INSANE, + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, @@ -4095,7 +4096,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, struct cgroup_subsys *ss; if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, files, true); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); if (err < 0) return err; } -- cgit v1.2.3 From 06d6b3cbdf94bc37732df83e7c25774370411a56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:11 +0800 Subject: cpuset: remove redundant check in cpuset_cpus_allowed_fallback() task_cs() will never return NULL. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe5..f0c884a0e574 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2253,8 +2253,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs = task_cs(tsk); - if (cs) - do_set_cpus_allowed(tsk, cs->cpus_allowed); + do_set_cpus_allowed(tsk, cs->cpus_allowed); rcu_read_unlock(); /* -- cgit v1.2.3 From 40df2deb50570b288b7067b111af0aa9ca640e6f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:23 +0800 Subject: cpuset: cleanup guarantee_online_{cpus|mems}() - We never pass a NULL @cs to these functions. - The top cpuset always has some online cpus/mems. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0c884a0e574..d753837cca33 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -304,53 +304,38 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_mask. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_mask. + * until we find one that does have some online cpus. The top + * cpuset always has some cpus online. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_mutex held. */ - static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { - while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); - if (cs) - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); - else - cpumask_copy(pmask, cpu_online_mask); - BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some - * online mems. If we get all the way to the top and still haven't - * found any online mems, return node_states[N_MEMORY]. + * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_mutex held. 
*/ - static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { - while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_MEMORY])) + while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); - if (cs) - nodes_and(*pmask, cs->mems_allowed, - node_states[N_MEMORY]); - else - *pmask = node_states[N_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); + nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); } /* -- cgit v1.2.3 From 67bd2c59850de20d0ecdc8084cbbfe34e53b6804 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:35 +0800 Subject: cpuset: remove unnecessary variable in cpuset_attach() We can just use oldcs->mems_allowed. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d753837cca33..dbef832e5e2d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1407,8 +1407,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { - /* static bufs protected by cpuset_mutex */ - static nodemask_t cpuset_attach_nodemask_from; + /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; @@ -1442,13 +1441,12 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. 
*/ - cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(leader); if (mm) { mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &cpuset_attach_nodemask_to); mmput(mm); } -- cgit v1.2.3 From 249cc86db7492dc8de1d2eddebc6bcc4ab2a8e9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:48 +0800 Subject: cpuset: remove cpuset_test_cpumask() The test is done in set_cpus_allowed_ptr(), so it's redundant. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dbef832e5e2d..51f8e1d5a2a9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -783,23 +783,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Call with cpuset_mutex held. May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). 
- */ -static int cpuset_test_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - return !cpumask_equal(&tsk->cpus_allowed, - (cgroup_cs(scan->cg))->cpus_allowed); -} - /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test @@ -835,7 +818,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) struct cgroup_scanner scan; scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; + scan.test_task = NULL; scan.process_task = cpuset_change_cpumask; scan.heap = heap; cgroup_scan_tasks(&scan); -- cgit v1.2.3 From a73456f37b9dbc917398387d0cba926b4455b70f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 5 Jun 2013 17:15:59 +0800 Subject: cpuset: re-structure update_cpumask() a bit Check if cpus_allowed is to be changed before calling validate_change(). This won't change any behavior, but later it will allow us to do this: # mkdir /cpuset/child # echo $$ > /cpuset/child/tasks /* empty cpuset */ # echo > /cpuset/child/cpuset.cpus /* do nothing, won't fail */ Without this patch, the last operation will fail. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51f8e1d5a2a9..535dce685eec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -856,14 +856,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; + retval = validate_change(cs, trialcs); + if (retval < 0) + return retval; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; -- cgit v1.2.3 From c5a130325f13b219438cb100e2da71a3e31199f3 Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Thu, 6 Jun 2013 15:20:51 -0700 Subject: ACPI/APEI: Add parameter check before error injection When param1 is enabled in EINJ but not assigned with a valid value, sometimes it will cause an error like the one below: APEI: Can not request [mem 0x7aaa7000-0x7aaa7007] for APEI EINJ Trigger registers It is because some firmware will access the target address specified in param1 to trigger the error when injecting memory error. This will cause resource conflict with regular memory. So it must be removed from trigger table resources, but incorrect param1/param2 combination will stop this action. Add extra check to avoid this kind of error.
Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- kernel/resource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index d7386986e10e..77bf11a86c7d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn) { return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +EXPORT_SYMBOL_GPL(page_is_ram); void __weak arch_remove_reservations(struct resource *avail) { -- cgit v1.2.3 From e44193d39e8d4d1de5d996fcd37ed75e5c704f10 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:14:22 +0800 Subject: cpuset: let hotplug propagation work wait for task attaching Instead of triggering propagation work in cpuset_attach(), we make hotplug propagation work wait until there's no task attaching in progress. IMO this is more robust. We won't see empty masks in cpuset_attach(). Also it's a preparation for removing propagation work. Without asynchronous propagation we can't call move_tasks_in_empty_cpuset() in cpuset_attach(), because otherwise we'll deadlock on cgroup_mutex. tj: typo fixes. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 535dce685eec..e902473f76bf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -59,6 +59,7 @@ #include #include #include +#include /* * Tracks how many cpusets are currently defined in system. @@ -275,6 +276,8 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. 
If someone tries to mount the "cpuset" filesystem, we @@ -1436,14 +1439,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) } cs->attach_in_progress--; - - /* - * We may have raced with CPU/memory hotunplug. Trigger hotplug - * propagation if @cs doesn't have any CPU or memory. It will move - * the newly added tasks to the nearest parent which can execute. - */ - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - schedule_cpuset_propagate_hotplug(cs); + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } @@ -1555,10 +1552,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. - * - * Flushing cpuset_hotplug_work is enough to synchronize against - * hotplug hanlding; however, cpuset_attach() may schedule - * propagation work directly. Flush the workqueue too. */ flush_work(&cpuset_hotplug_work); flush_workqueue(cpuset_propagate_hotplug_wq); @@ -2005,8 +1998,20 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); bool is_empty; +retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); + mutex_lock(&cpuset_mutex); + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. 
+ */ + if (cs->attach_in_progress) { + mutex_unlock(&cpuset_mutex); + goto retry; + } + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); -- cgit v1.2.3 From 388afd8549dc8be0920e00ae9404341593b6bd7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:14:47 +0800 Subject: cpuset: remove async hotplug propagation work As we can drop rcu read lock while iterating cgroup hierarchy, we don't have to do propagation asynchronously via workqueue. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 69 +++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e902473f76bf..608fe1308b22 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -101,8 +101,6 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - - struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -268,12 +266,7 @@ static DEFINE_MUTEX(callback_mutex); /* * CPU / memory hotplug is handled asynchronously. */ -static struct workqueue_struct *cpuset_propagate_hotplug_wq; - static void cpuset_hotplug_workfn(struct work_struct *work); -static void cpuset_propagate_hotplug_workfn(struct work_struct *work); -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); - static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -1554,7 +1547,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * after execution capability is restored. 
*/ flush_work(&cpuset_hotplug_work); - flush_workqueue(cpuset_propagate_hotplug_wq); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -1821,7 +1813,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); - INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; return &cs->css; @@ -1984,18 +1975,17 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /** - * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ -static void cpuset_propagate_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug_update_tasks(struct cpuset *cs) { static cpumask_t off_cpus; static nodemask_t off_mems, tmp_mems; - struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); bool is_empty; retry: @@ -2044,34 +2034,6 @@ retry: */ if (is_empty) remove_tasks_in_empty_cpuset(cs); - - /* the following may free @cs, should be the last operation */ - css_put(&cs->css); -} - -/** - * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset - * @cs: cpuset of interest - * - * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and - * memory masks according to top_cpuset. - */ -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) -{ - /* - * Pin @cs. The refcnt will be released when the work item - * finishes executing. - */ - if (!css_tryget(&cs->css)) - return; - - /* - * Queue @cs->hotplug_work. If already pending, lose the css ref. 
- * cpuset_propagate_hotplug_wq is ordered and propagation will - * happen in the order this function is called. - */ - if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) - css_put(&cs->css); } /** @@ -2084,8 +2046,8 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory - * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all - * descendants. + * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on + * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. @@ -2128,21 +2090,26 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); } + mutex_unlock(&cpuset_mutex); + /* if cpus or mems went down, we need to propagate to descendants */ if (cpus_offlined || mems_offlined) { struct cpuset *cs; struct cgroup *pos_cgrp; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) - schedule_cpuset_propagate_hotplug(cs); - rcu_read_unlock(); - } + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { + if (!css_tryget(&cs->css)) + continue; + rcu_read_unlock(); - mutex_unlock(&cpuset_mutex); + cpuset_hotplug_update_tasks(cs); - /* wait for propagations to finish */ - flush_workqueue(cpuset_propagate_hotplug_wq); + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); + } /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated) @@ -2193,10 +2160,6 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - - cpuset_propagate_hotplug_wq = - alloc_ordered_workqueue("cpuset_hotplug", 0); - BUG_ON(!cpuset_propagate_hotplug_wq); } /** -- cgit v1.2.3 From 5e1cda5b8ae93f5f02e8c5a30390ac9b4d2c20e6 Mon Sep 
17 00:00:00 2001 From: Grant Likely Date: Wed, 29 May 2013 03:10:53 +0100 Subject: irqdomain: Relax failure path on setting up mappings Commit 98aa468e, "irqdomain: Support for static IRQ mapping and association" introduced an API for directly associating blocks of hwirqs to linux irqs. However, if any irq in that block failed to map (say if the mapping functions returns an error because the irq is already mapped) then the whole thing will fail and roll back. This is probably too aggressive since there are valid reasons why a mapping may fail. ie. Firmware may have a particular IRQ marked as unusable. This patch drops the error path out of irq_domain_associate(). If a mapping fails, then it is simply skipped. There is no reason to fail the entire allocation. v2: Still output an information message on failed mappings and make sure attempted mapping gets cleared out of the irq_data structure. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner --- kernel/irq/irqdomain.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 20b677dd0b27..61d6d3c80fee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -464,23 +464,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, /* * If map() returns -EPERM, this interrupt is protected * by the firmware or some other service and shall not - * be mapped. - * - * Since on some platforms we blindly try to map everything - * we end up with a log full of backtraces. - * - * So instead, we silently fail on -EPERM, it is the - * responsibility of the PIC driver to display a relevant - * message if needed. + * be mapped. Don't bother telling the user about it. 
*/ if (ret != -EPERM) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + of_node_full_name(domain->of_node), hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; - goto err_unmap; + continue; } } -- cgit v1.2.3 From 9bbf877d3b6b8c5991000296f40a3f0fe66fa89b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 12:10:24 +0100 Subject: irqdomain: Replace LEGACY mapping with LINEAR The LEGACY mapping unnecessarily complicates the irqdomain code and can easily be implemented with a linear mapping. By ripping it out and replacing it with the LINEAR mapping the object size of irqdomain.c shrinks by about 330 bytes (ARMv7) which offsets the additional allocation required by the linear map. It also makes it possible for current LEGACY map users to pre-allocate irq_descs for a subset of the hwirqs and dynamically allocate the rest as needed. Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- kernel/irq/irqdomain.c | 84 ++++---------------------------------------------- 1 file changed, 6 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 61d6d3c80fee..1ac8cf41b9a5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -82,13 +82,6 @@ void irq_domain_remove(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* - * Legacy domains don't manage their own irq_desc - * allocations, we expect the caller to handle irq_desc - * freeing on their own. 
- */ - break; case IRQ_DOMAIN_MAP_TREE: /* * radix_tree_delete() takes care of destroying the root @@ -122,17 +115,6 @@ void irq_domain_remove(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove); -static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; - int size = domain->revmap_data.legacy.size; - - if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) - return 0; - return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; -} - /** * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. @@ -213,57 +195,17 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int i; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", + first_irq, first_irq + size - 1, + (int)first_hwirq, (int)first_hwirq + size -1); + + domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); if (!domain) return NULL; - domain->revmap_data.legacy.first_irq = first_irq; - domain->revmap_data.legacy.first_hwirq = first_hwirq; - domain->revmap_data.legacy.size = size; - - mutex_lock(&irq_domain_mutex); - /* Verify that all the irqs are available */ - for (i = 0; i < size; i++) { - int irq = first_irq + i; - struct irq_data *irq_data = irq_get_irq_data(irq); - - if (WARN_ON(!irq_data || irq_data->domain)) { - mutex_unlock(&irq_domain_mutex); - irq_domain_free(domain); - return NULL; - } - } + WARN_ON(irq_domain_associate_many(domain, first_irq, first_hwirq, size)); - /* Claim all of the irqs before registering a legacy domain */ - for (i = 0; i < size; i++) { - struct irq_data *irq_data = irq_get_irq_data(first_irq + i); - irq_data->hwirq = first_hwirq + i; - irq_data->domain = domain; - } 
- mutex_unlock(&irq_domain_mutex); - - for (i = 0; i < size; i++) { - int irq = first_irq + i; - int hwirq = first_hwirq + i; - - /* IRQ0 gets ignored */ - if (!irq) - continue; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - if (ops->map) - ops->map(domain, irq, hwirq); - - /* Clear norequest flags */ - irq_clear_status_flags(irq, IRQ_NOREQUEST); - } - - irq_domain_add(domain); return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); @@ -492,10 +434,6 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, } return 0; - - err_unmap: - irq_domain_disassociate_many(domain, irq_base, i); - return -EINVAL; } EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -575,10 +513,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } - /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return irq_domain_legacy_revmap(domain, hwirq); - /* Allocate a virtual interrupt number */ hint = hwirq % nr_irqs; if (hint == 0) @@ -706,10 +640,6 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - /* Never unmap legacy interrupts */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - irq_domain_disassociate_many(domain, virq, 1); irq_free_desc(virq); } @@ -732,8 +662,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - return irq_domain_legacy_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); case IRQ_DOMAIN_MAP_TREE: -- cgit v1.2.3 From 0bb4afb45dd1add73ca643a865daa38716aeff0c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 14:23:30 +0100 Subject: irqdomain: Add a name field This patch adds a name field to the irq_domain structure to help mere mortals understand the mappings between irq domains and virqs. 
It also converts a number of places that have open-coded some kind of fudging an irqdomain name to use the new field. This means a more consistent display of names in irq domain log messages and debugfs output. Signed-off-by: Grant Likely --- kernel/irq/generic-chip.c | 1 + kernel/irq/irqdomain.c | 19 ++++++------------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 95575d8d5392..ca98cc5d6308 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -305,6 +305,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } + d->name = name; return 0; } EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ac8cf41b9a5..b1b5e6793fd2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -410,12 +410,15 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, */ if (ret != -EPERM) { pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", - of_node_full_name(domain->of_node), hwirq, virq, ret); + domain->name, hwirq, virq, ret); } irq_data->domain = NULL; irq_data->hwirq = 0; continue; } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; } switch (domain->revmap_type) { @@ -708,8 +711,6 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - const char *p; - static const char none[] = "none"; void *data; int i; @@ -731,20 +732,12 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); + 
seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain) - p = of_node_full_name(desc->irq_data.domain->of_node); - else - p = none; - seq_printf(m, "%s\n", p); + seq_printf(m, "%s\n", desc->irq_data.domain->name); } raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From cef5075c8c238ffd04c86a77a5a9bdbd18031137 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 11 Jul 2012 17:24:31 +0100 Subject: irqdomain: merge linear and tree reverse mappings. Keeping them separate makes irq_domain more complex and adds a lot of code (as proven by the diffstat). Merging them simplifies the whole scheme. This change makes it so both the tree and linear methods can be used by the same irq_domain instance. If the hwirq is less than the ->linear_size, then the linear map is used to reverse map the hwirq. Otherwise the radix tree is used. The test for which map to use is no more expensive than the existing code, so the performance of the fast path is preserved. It also means that complex interrupt controllers can use both the linear map and a tree in the same domain. This may be useful for an interrupt controller with a base set of core irqs and a large number of GPIOs which might be used as irqs. The linear map could cover the core irqs, and the tree used for the gpios.
v2: Drop reorganization of revmap data Signed-off-by: Grant Likely Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Rob Herring --- kernel/irq/irqdomain.c | 107 ++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b1b5e6793fd2..5a1d8ec8509e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -34,22 +34,24 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure. */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, + unsigned int revmap_type, int size, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; - domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, - of_node_to_nid(of_node)); + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), + GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; /* Fill structure */ + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); domain->revmap_type = revmap_type; domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->revmap_data.linear.size = size; return domain; } @@ -81,22 +83,12 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_TREE: - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_data.tree.height); - break; - case IRQ_DOMAIN_MAP_LINEAR: - kfree(domain->revmap_data.linear.revmap); - domain->revmap_data.linear.size = 0; - break; - case IRQ_DOMAIN_MAP_NOMAP: - break; - } + /* + * radix_tree_delete() takes care of destroying the root + * node when all entries are removed. Shout if there are + * any mappings left. 
+ */ + WARN_ON(domain->revmap_data.tree.height); list_del(&domain->link); @@ -223,20 +215,11 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int *revmap; - revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, - of_node_to_nid(of_node)); - if (WARN_ON(!revmap)) + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + if (!domain) return NULL; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); - if (!domain) { - kfree(revmap); - return NULL; - } - domain->revmap_data.linear.size = size; - domain->revmap_data.linear.revmap = revmap; irq_domain_add(domain); return domain; } @@ -248,7 +231,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); if (domain) { domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); @@ -257,28 +240,6 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_nomap); -/** - * irq_domain_add_tree() - * @of_node: pointer to interrupt controller's device tree node. - * @ops: map/unmap domain callbacks - * - * Note: The radix tree will be allocated later during boot automatically - * (the reverse mapping will use the slow path until that happens). 
- */ -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_TREE, ops, host_data); - if (domain) { - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_tree); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller @@ -359,17 +320,13 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->domain = NULL; irq_data->hwirq = 0; - /* Clear reverse map */ - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: + /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = 0; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); - break; } } } @@ -421,16 +378,12 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = virq; - break; - case IRQ_DOMAIN_MAP_TREE: + if (hwirq < domain->revmap_data.linear.size) { + domain->linear_revmap[hwirq] = virq; + } else { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); - break; } irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -667,13 +620,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, switch (domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_TREE: - rcu_read_lock(); - data 
= radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - if (data) - return data->irq; - break; case IRQ_DOMAIN_MAP_NOMAP: data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) @@ -696,13 +642,18 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct irq_data *data; BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) - return 0; + if (hwirq >= domain->revmap_data.linear.size) { + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; + } - return domain->revmap_data.linear.revmap[hwirq]; + return domain->linear_revmap[hwirq]; } EXPORT_SYMBOL_GPL(irq_linear_revmap); -- cgit v1.2.3 From 1aa0dd94ca07df818cf14588c9031ab1d7fd84d3 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:03:59 +0100 Subject: irqdomain: Eliminate revmap type The NOMAP irq_domain type is only used by a handful of interrupt controllers and it unnecessarily complicates the code by adding special cases on how to look up mappings and different revmap functions are used for each type which need to validate the correct type is passed to it before performing the reverse map. Eliminating the revmap_type and making a single reverse mapping function simplifies the code. It also shouldn't be any slower than having separate revmap functions because the type of the revmap needed to be checked anyway. The linear and tree revmap types were already merged in a previous patch. This patch rolls the NOMAP or direct mapping behaviour into the same domain code making it possible for an irq domain to do any mapping type; linear, tree or direct; and that the mapping will be transparent to the interrupt controller driver.
With this change, direct mappings will get stored in the linear or tree mapping for consistency. Reverse mapping from the hwirq to virq will go through the normal lookup process. However, any controller using a direct mapping can take advantage of knowing that hwirq==virq for any mapped interrupts and skip doing a revmap lookup when handling IRQs. Signed-off-by: Grant Likely --- kernel/irq/generic-chip.c | 5 +---- kernel/irq/irqdomain.c | 55 +++++++++++++++++++---------------------------- 2 files changed, 23 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ca98cc5d6308..4b011064e146 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -270,10 +270,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, if (d->gc) return -EBUSY; - if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) - return -EINVAL; - - numchips = d->revmap_data.linear.size / irqs_per_chip; + numchips = d->revmap_size / irqs_per_chip; if (!numchips) return -EINVAL; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a1d8ec8509e..c38be78fceb4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,7 +25,6 @@ static struct irq_domain *irq_default_domain; /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -34,7 +33,7 @@ static struct irq_domain *irq_default_domain; * to IRQ domain, or NULL on failure.
*/ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, int size, + int size, const struct irq_domain_ops *ops, void *host_data) { @@ -46,12 +45,11 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, return NULL; /* Fill structure */ - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - domain->revmap_type = revmap_type; + INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); - domain->revmap_data.linear.size = size; + domain->revmap_size = size; return domain; } @@ -67,8 +65,7 @@ static void irq_domain_add(struct irq_domain *domain) mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - pr_debug("Allocated domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Added domain %s\n", domain->name); } /** @@ -88,7 +85,7 @@ void irq_domain_remove(struct irq_domain *domain) * node when all entries are removed. Shout if there are * any mappings left. 
*/ - WARN_ON(domain->revmap_data.tree.height); + WARN_ON(domain->revmap_tree.height); list_del(&domain->link); @@ -100,8 +97,7 @@ void irq_domain_remove(struct irq_domain *domain) mutex_unlock(&irq_domain_mutex); - pr_debug("Removed domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Removed domain %s\n", domain->name); irq_domain_free(domain); } @@ -216,7 +212,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, { struct irq_domain *domain; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, size, ops, host_data); + domain = irq_domain_alloc(of_node, size, ops, host_data); if (!domain) return NULL; @@ -230,10 +226,9 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, 0, ops, host_data); + struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); if (domain) { - domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; + domain->revmap_direct_max_irq = max_irq ? 
max_irq : ~0; irq_domain_add(domain); } return domain; @@ -321,11 +316,11 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, irq_data->hwirq = 0; /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_tree, hwirq); mutex_unlock(&revmap_trees_mutex); } } @@ -378,11 +373,11 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, domain->name = irq_data->chip->name; } - if (hwirq < domain->revmap_data.linear.size) { + if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } @@ -399,7 +394,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use - * the linux irq as the hardware interrupt number. + * the linux irq as the hardware interrupt number. It still uses the linear + * or radix tree to store the mapping, but the irq controller can optimize + * the revmap path by using the hwirq directly. 
*/ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { @@ -408,17 +405,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) - return 0; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } - if (virq >= domain->revmap_data.nomap.max_irq) { + if (virq >= domain->revmap_direct_max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - domain->revmap_data.nomap.max_irq); + domain->revmap_direct_max_irq); irq_free_desc(virq); return 0; } @@ -617,17 +611,13 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_NOMAP: + if (hwirq < domain->revmap_direct_max_irq) { data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) return hwirq; - break; } - return 0; + return irq_linear_revmap(domain, hwirq); } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -643,12 +633,11 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *data; - BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); /* Check revmap bounds; complain if exceeded */ - if (hwirq >= domain->revmap_data.linear.size) { + if (hwirq >= domain->revmap_size) { rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); rcu_read_unlock(); return data ? 
data->irq : 0; } -- cgit v1.2.3 From fa40f377577752b83252b9d2b3165d4acee0eb7c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 8 Jun 2013 12:57:40 +0100 Subject: irqdomain: Clean up aftermath of irq_domain refactoring After refactoring the irqdomain code, there are a number of API functions that are merely empty wrappers around core code. Drop those wrappers out of the C file and replace them with static inlines in the header. Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 127 ++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c38be78fceb4..e0db59e2eef6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; /** - * irq_domain_alloc() - Allocate a new irq_domain data structure + * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -32,10 +35,10 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure. 
*/ -static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - int size, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *__irq_domain_add(struct device_node *of_node, + int size, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; @@ -50,23 +53,16 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); domain->revmap_size = size; + domain->revmap_direct_max_irq = direct_max; - return domain; -} - -static void irq_domain_free(struct irq_domain *domain) -{ - of_node_put(domain->of_node); - kfree(domain); -} - -static void irq_domain_add(struct irq_domain *domain) -{ mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); + pr_debug("Added domain %s\n", domain->name); + return domain; } +EXPORT_SYMBOL_GPL(__irq_domain_add); /** * irq_domain_remove() - Remove an irq domain. @@ -99,30 +95,28 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - irq_domain_free(domain); + of_node_put(domain->of_node); + kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); /** - * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, - * pass zero to assign irqs on-the-fly. This will result in a - * linear IRQ domain so it is important to use irq_create_mapping() - * for each used IRQ, especially when SPARSE_IRQ is enabled. + * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then + * pre-map all of the irqs in the domain to virqs starting at first_irq. 
* @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. For the legacy domain, IRQ descriptors will also - * be allocated. + * Allocates an irq_domain, and optionally if first_irq is positive then also + * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most - * interrupt controllers which is that a linear mapping should - * normally be used unless the system requires a legacy mapping in - * order to support supplying interrupt numbers during non-DT - * registration of devices. + * interrupt controllers. If device tree is used, then first_irq will be 0 and + * irqs get mapped dynamically on the fly. However, if the controller requires + * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, @@ -130,33 +124,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) { - int irq_base; + struct irq_domain *domain; + + domain = __irq_domain_add(of_node, size, 0, ops, host_data); + if (!domain) + return NULL; + if (first_irq > 0) { if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* - * Set the descriptor allocator to search for a - * 1-to-1 mapping, such as irq_alloc_desc_at(). - * Use of_node_to_nid() which is defined to - * numa_node_id() on platforms that have no custom - * implementation. 
- */ - irq_base = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); - if (irq_base < 0) { + /* attempt to allocated irq_descs */ + int rc = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); - irq_base = first_irq; - } - } else - irq_base = first_irq; - - return irq_domain_add_legacy(of_node, size, irq_base, 0, - ops, host_data); + } + WARN_ON(irq_domain_associate_many(domain, first_irq, 0, size)); } - /* A linear domain is the default */ - return irq_domain_add_linear(of_node, size, ops, host_data); + return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -184,11 +170,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - pr_debug("Setting up legacy domain virq[%i:%i] ==> hwirq[%i:%i]\n", - first_irq, first_irq + size - 1, - (int)first_hwirq, (int)first_hwirq + size -1); - - domain = irq_domain_add_linear(of_node, first_hwirq + size, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; @@ -198,43 +180,6 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); -/** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. 
- * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain; - - domain = irq_domain_alloc(of_node, size, ops, host_data); - if (!domain) - return NULL; - - irq_domain_add(domain); - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_linear); - -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - unsigned int max_irq, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, 0, ops, host_data); - if (domain) { - domain->revmap_direct_max_irq = max_irq ? max_irq : ~0; - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_nomap); - /** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller -- cgit v1.2.3 From 1400ea86025a22862f97e7fe544433751b43ecec Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 6 Jun 2013 22:20:44 +0100 Subject: irqdomain: Beef up debugfs output This patch increases the amount of output produced by the irq_domain_mapping debugfs file by first listing all of the registered irq domains at the beginning of the output, and then by including all mapped IRQs in the output, not just the active ones. It is very useful when debugging irqdomain issues to be able to see the entire list of mapped irqs, not just the ones that happen to be connected to devices. 
Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e0db59e2eef6..280b8047d8db 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -596,12 +596,29 @@ static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - void *data; + struct irq_domain *domain; + struct radix_tree_iter iter; + void *data, **slot; int i; - seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", + "name", "mapped", "linear-max", "direct-max", "devtree-node"); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, link) { + int count = 0; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) + count++; + seq_printf(m, "%c%-16s %6u %10u %10u %s\n", + domain == irq_default_domain ? '*' : ' ', domain->name, + domain->revmap_size + count, domain->revmap_size, + domain->revmap_direct_max_irq, + domain->of_node ? of_node_full_name(domain->of_node) : ""); + } + mutex_unlock(&irq_domain_mutex); + + seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "domain name"); + "active", "type", "domain"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -609,12 +626,15 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); + domain = desc->irq_data.domain; - if (desc->action && desc->action->handler) { + if (domain) { struct irq_chip *chip; + int hwirq = desc->irq_data.hwirq; + bool direct; seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + seq_printf(m, "0x%05x ", hwirq); chip = irq_desc_get_chip(desc); seq_printf(m, "%-15s ", (chip && chip->name) ? 
chip->name : "none"); @@ -622,6 +642,11 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); seq_printf(m, "%s\n", desc->irq_data.domain->name); } -- cgit v1.2.3 From d7f3e207397d7b4868e33d3f88396a06f4d5a8c7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2013 16:19:52 -0700 Subject: rcu: Convert rcutree.c printk calls This commit converts printk() calls to the corresponding pr_*() calls. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 35380019f0fc..1009c0ccd4b1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { @@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) smp_processor_id(), (long)(jiffies - rsp->gp_start), rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) - printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) rcu_dump_cpu_stacks(rsp); @@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. 
*/ - printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); -- cgit v1.2.3 From 6eaef633d77f50f031dd355ff5f91aaa1aaf9885 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 10:08:37 -0700 Subject: rcu: Move code to apply callback-numbering simplifications The addition of callback numbering allows combining the detection of the ends of old grace periods and the beginnings of new grace periods. This commit moves code to set the stage for this combining. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 118 +++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1009c0ccd4b1..c36e52dc091d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -984,65 +984,6 @@ void rcu_cpu_stall_reset(void) rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } -/* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. - */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - if (rdp->gpnum != rnp->gpnum) { - /* - * If the current grace period is waiting for this CPU, - * set up to detect a quiescent state, otherwise don't - * go looking for one. 
- */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - rdp->passed_quiesce = 0; - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); - zero_cpu_stall_ticks(rdp); - } -} - -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __note_new_gpnum(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - /* * Initialize the specified rcu_data structure's callback list to empty. */ @@ -1359,6 +1300,45 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat } } +/* + * Update CPU-local rcu_data state to record the newly noticed grace period. + * This is used both when we started the grace period and when we notice + * that someone else started the grace period. The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + * and must have irqs disabled. + */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + if (rdp->gpnum != rnp->gpnum) { + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. 
+ */ + rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + zero_cpu_stall_ticks(rdp); + } +} + +static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + local_irq_restore(flags); + return; + } + __note_new_gpnum(rsp, rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + /* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp @@ -1381,6 +1361,26 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } +/* + * Did someone else start a new RCU grace period start since we last + * checked? Update local state appropriately if so. Must be called + * on the CPU corresponding to rdp. + */ +static int +check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + if (rdp->gpnum != rsp->gpnum) { + note_new_gpnum(rsp, rdp); + ret = 1; + } + local_irq_restore(flags); + return ret; +} + /* * Do per-CPU grace-period initialization for running CPU. The caller * must hold the lock of the leaf rcu_node structure corresponding to -- cgit v1.2.3 From 398ebe6000c16135d12ce2ff64318f306ffb20b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 10:53:14 -0700 Subject: rcu: Make __note_new_gpnum() check for ends of prior grace periods The current implementation can detect the beginning of a new grace period before noting the end of a previous grace period. 
Although the current implementation correctly handles this sort of nonsense, it would be good to reduce RCU's state space by making such nonsense unnecessary, which is now possible thanks to the fact that RCU's callback groups are now numbered. This commit therefore makes __note_new_gpnum() invoke __rcu_process_gp_end() in order to note the ends of prior grace periods before noting the beginnings of new grace periods. Of course, this now means that note_new_gpnum() notes both the beginnings and ends of grace periods, and could therefore be used in place of rcu_process_gp_end(). But that is a job for later commits. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c36e52dc091d..54aba759b609 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1309,6 +1309,9 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat */ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + /* Handle the ends of any preceding grace periods first. */ + __rcu_process_gp_end(rsp, rnp, rdp); + if (rdp->gpnum != rnp->gpnum) { /* * If the current grace period is waiting for this CPU, -- cgit v1.2.3 From d34ea3221a0f34ed42eadabf054604bbcc7ecd27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 11:10:43 -0700 Subject: rcu: Rename note_new_gpnum() to note_gp_changes() Because note_new_gpnum() now also checks for the ends of old grace periods, this commit changes its name to note_gp_changes(). Later commits will merge rcu_process_gp_end() into note_gp_changes(). Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 54aba759b609..7eb2bc95300a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1307,7 +1307,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * ->lock of the leaf rcu_node structure corresponding to the current CPU, * and must have irqs disabled. */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* Handle the ends of any preceding grace periods first. */ __rcu_process_gp_end(rsp, rnp, rdp); @@ -1326,19 +1326,20 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct } } -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_node *rnp; local_irq_save(flags); rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && + rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - __note_new_gpnum(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1377,7 +1378,7 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); + note_gp_changes(rsp, rdp); ret = 1; } local_irq_restore(flags); @@ -1396,7 +1397,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat __rcu_process_gp_end(rsp, rnp, rdp); /* Set state so that this CPU will detect the next quiescent state. 
*/ - __note_new_gpnum(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); } /* -- cgit v1.2.3 From efc151c33b971148894789dc7c5589dec46d4348 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2013 16:24:11 -0700 Subject: rcu: Convert rcutree_plugin.h printk calls This commit converts printk() calls to the corresponding pr_*() calls. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3db5a375d8dd..207844ea0226 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5]; static void __init rcu_bootup_announce_oddness(void) { #ifdef CONFIG_RCU_TRACE - printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU debugfs-based tracing is enabled.\n"); #endif #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", CONFIG_RCU_FANOUT); #endif #ifdef CONFIG_RCU_FANOUT_EXACT - printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); #endif #ifdef CONFIG_RCU_FAST_NO_HZ - printk(KERN_INFO - "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); #endif #ifdef CONFIG_PROVE_RCU - printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); + pr_info("\tRCU lockdep checking is enabled.\n"); #endif #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); + pr_info("\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && 
!defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); + pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); #endif #if defined(CONFIG_RCU_CPU_STALL_INFO) - printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); + pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); + pr_info("\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { @@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); + pr_info("Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) static void rcu_print_task_stall_begin(struct rcu_node *rnp) { - printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); } static void rcu_print_task_stall_end(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, 
rcu_node_entry) { - printk(KERN_CONT " P%d", t->pid); + pr_cont(" P%d", t->pid); ndetected++; } rcu_print_task_stall_end(); @@ -942,7 +941,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state; */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Hierarchical RCU implementation.\n"); + pr_info("Hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -1883,7 +1882,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) /* Initiate the stall-info list. */ static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } /* @@ -1914,7 +1913,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, @@ -1925,7 +1924,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) /* Terminate the stall-info list. */ static void print_cpu_stall_info_end(void) { - printk(KERN_ERR "\t"); + pr_err("\t"); } /* Zero ->ticks_this_gp for all flavors of RCU. */ @@ -1948,17 +1947,17 @@ static void increment_cpu_stall_ticks(void) static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT " {"); + pr_cont(" {"); } static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { - printk(KERN_CONT " %d", cpu); + pr_cont(" %d", cpu); } static void print_cpu_stall_info_end(void) { - printk(KERN_CONT "} "); + pr_cont("} "); } static void zero_cpu_stall_ticks(struct rcu_data *rdp) -- cgit v1.2.3 From 470716fc043aba2fea832334e58d5cd5d82288a3 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 19 Mar 2013 11:32:11 -0700 Subject: rcu: Switch callers from rcu_process_gp_end() to note_gp_changes() Because note_gp_changes() now incorporates rcu_process_gp_end() function, this commit switches to the former and eliminates the latter. In addition, this commit changes external calls from __rcu_process_gp_end() to __note_gp_changes(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 31 +++---------------------------- kernel/rcutree_plugin.h | 2 +- 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7eb2bc95300a..b04f134ab8bc 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1343,28 +1343,6 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __rcu_process_gp_end(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * Did someone else start a new RCU grace period start since we last * checked? Update local state appropriately if so. Must be called @@ -1393,9 +1371,6 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - /* Prior grace period ended, so advance callbacks for current CPU. */ - __rcu_process_gp_end(rsp, rnp, rdp); - /* Set state so that this CPU will detect the next quiescent state. 
*/ __note_gp_changes(rsp, rnp, rdp); } @@ -1531,7 +1506,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __rcu_process_gp_end(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); @@ -2276,7 +2251,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); /* Handle the end of a grace period that some other CPU ended. */ - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -2362,7 +2337,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); check_for_new_grace_period(rsp, rdp); /* Start a new grace period if one not already started. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 207844ea0226..f279148a0168 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1628,7 +1628,7 @@ static bool rcu_try_advance_all_cbs(void) */ if (rdp->completed != rnp->completed && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); if (cpu_has_callbacks_ready_to_invoke(rdp)) cbs_ready = true; -- cgit v1.2.3 From ba9fbe955f026780e6b27c279dba7c86dfdcb7d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 11:53:31 -0700 Subject: rcu: Merge __rcu_process_gp_end() into __note_gp_changes() This commit eliminates some duplicated code by merging __rcu_process_gp_end() into __note_gp_changes(). Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 48 ++++++------------------------------------------ 1 file changed, 6 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b04f134ab8bc..ac8f03c41476 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1254,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. In addition, the corresponding leaf rcu_node structure's - * ->lock must be held by the caller, with irqs disabled. + * Update CPU-local rcu_data state to record the beginnings and ends of + * grace periods. The caller must hold the ->lock of the leaf rcu_node + * structure corresponding to the current CPU, and must have irqs disabled. */ -static void -__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - /* Did another grace period end? */ + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed) { - /* No, so just accelerate recent callbacks. */ + /* No grace period end, so just accelerate recent callbacks. */ rcu_accelerate_cbs(rsp, rnp, rdp); } else { @@ -1276,41 +1274,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); - - /* - * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs handled on - * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. If another grace period - * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. 
Of course, any quiescent - * states we found for the old GP are now invalid. - */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { - rdp->gpnum = rdp->completed; - rdp->passed_quiesce = 0; - } - - /* - * If RCU does not need a quiescent state from this CPU, - * then make sure that this CPU doesn't go looking for one. - */ - if ((rnp->qsmask & rdp->grpmask) == 0) - rdp->qs_pending = 0; } -} - -/* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. - */ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Handle the ends of any preceding grace periods first. */ - __rcu_process_gp_end(rsp, rnp, rdp); if (rdp->gpnum != rnp->gpnum) { /* -- cgit v1.2.3 From 63274cfb94aac109fc2490a70a96b26751608e57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 12:21:29 -0700 Subject: rcu: Eliminate check_for_new_grace_period() wrapper function One of the calls to check_for_new_grace_period() is now redundant due to an immediately preceding call to note_gp_changes(). Eliminating this redundant call leaves a single caller, which is simpler if inlined. This commit therefore eliminates the redundant call and inlines the body of check_for_new_grace_period() into the single remaining call site. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac8f03c41476..b73014998b40 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1307,26 +1307,6 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_gp_changes(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - /* * Do per-CPU grace-period initialization for running CPU. The caller * must hold the lock of the leaf rcu_node structure corresponding to @@ -1749,8 +1729,10 @@ static void rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) { /* If there is now a new grace period, record and return. */ - if (check_for_new_grace_period(rsp, rdp)) + if (rdp->gpnum != rsp->gpnum) { + note_gp_changes(rsp, rdp); return; + } /* * Does this CPU still need to do its part for current grace period? @@ -2302,7 +2284,6 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Are we ignoring a completed grace period? */ note_gp_changes(rsp, rdp); - check_for_new_grace_period(rsp, rdp); /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { -- cgit v1.2.3 From ce3d9c03d1fa079678cc8df1517011e215517cda Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 19 Mar 2013 12:27:50 -0700 Subject: rcu: Inline trivial wrapper function rcu_start_gp_per_cpu() Given the changes that introduce note_gp_changes(), rcu_start_gp_per_cpu() is now a trivial wrapper function with only one caller. This commit therefore inlines it into its sole call site. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b73014998b40..391bd724cd77 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1307,18 +1307,6 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -/* - * Do per-CPU grace-period initialization for running CPU. The caller - * must hold the lock of the leaf rcu_node structure corresponding to - * this CPU. - */ -static void -rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Set state so that this CPU will detect the next quiescent state. */ - __note_gp_changes(rsp, rnp, rdp); -} - /* * Initialize a new grace period. */ @@ -1367,7 +1355,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, -- cgit v1.2.3 From 05eb552bf5ed9e7277bdc9c273ed2f4e9b7dc3e5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2013 12:38:24 -0700 Subject: rcu: Move redundant call to note_gp_changes() into called function The __rcu_process_callbacks() function invokes note_gp_changes() immediately before invoking rcu_check_quiescent_state(), which conditionally invokes that same function.
This commit therefore eliminates the call to note_gp_changes() in __rcu_process_callbacks() in favor of making the call from rcu_check_quiescent_state() to note_gp_changes() unconditional. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 391bd724cd77..7a5194ef90da 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1716,11 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) { - /* If there is now a new grace period, record and return. */ - if (rdp->gpnum != rsp->gpnum) { - note_gp_changes(rsp, rdp); - return; - } + /* Check for grace-period ends and beginnings. */ + note_gp_changes(rsp, rdp); /* * Does this CPU still need to do its part for current grace period? @@ -2184,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* Handle the end of a grace period that some other CPU ended. */ - note_gp_changes(rsp, rdp); - /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); -- cgit v1.2.3 From 9a5739d73f9369ba1cdba3889ee4e2f87be25a46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2013 20:48:36 -0700 Subject: rcu: Remove "Experimental" flags After a release or two, features are no longer experimental. Therefore, this commit removes the "Experimental" tag from them. Reported-by: Paul Gortmaker Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 207844ea0226..6b3ccaae93ab 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -81,7 +81,7 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - pr_info("\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU @@ -91,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void) have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tExperimental no-CBs CPU 0\n"); + pr_info("\tOffload RCU callbacks from CPU 0\n"); cpumask_set_cpu(0, rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tExperimental no-CBs for all CPUs\n"); + pr_info("\tOffload RCU callbacks from all CPUs\n"); cpumask_setall(rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) - pr_info("\tExperimental polled no-CBs CPUs.\n"); + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); } #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } -- cgit v1.2.3 From 026ad2835ce6202069e7aa0b11f5f1be4de34550 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 3 Apr 2013 22:14:11 -0700 Subject: rcu: Drive quiescent-state-forcing delay from HZ Systems with HZ=100 can have slow bootup times due to the default three-jiffy delays between quiescent-state forcing attempts. This commit therefore auto-tunes the RCU_JIFFIES_TILL_FORCE_QS value based on the value of HZ. However, this would break very large systems that require more time between quiescent-state forcing attempts. This commit therefore also ups the default delay by one jiffy for each 256 CPUs that might be on the system (based off of nr_cpu_ids at runtime, -not- NR_CPUS at build time). Updated to collapse #ifdefs for RCU_JIFFIES_TILL_FORCE_QS into a step-function definition as suggested by Josh Triplett. Reported-by: Paul Mackerras Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 18 ++++++++++++++++-- kernel/rcutree.h | 15 ++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1009c0ccd4b1..f344d3c824a4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -218,8 +218,8 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; -static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); @@ -3265,11 +3265,25 @@ static void __init rcu_init_one(struct rcu_state *rsp, */ static void __init rcu_init_geometry(void) { + ulong d; int i; int j; int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; + /* + * Initialize any unspecified boot parameters. 
+ * The default values of jiffies_till_first_fqs and + * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS + * value, which is a function of HZ, then adding one for each + * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. + */ + d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + if (jiffies_till_first_fqs == ULONG_MAX) + jiffies_till_first_fqs = d; + if (jiffies_till_next_fqs == ULONG_MAX) + jiffies_till_next_fqs = d; + /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4df503470e42..4a39d364493c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -343,12 +343,17 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) + /* For jiffies_till_first_fqs and */ + /* and jiffies_till_next_fqs. */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ +#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ + /* delay between bouts of */ + /* quiescent-state forcing. */ + +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ + /* at least one scheduling clock */ + /* irq before ratting on them. */ #define rcu_wait(cond) \ do { \ -- cgit v1.2.3 From 4982969d965ec87b1887c86d2e0b3d81065e1d38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2013 16:56:53 -0700 Subject: rcu: Merge adjacent identical ifdefs Two ifdefs in kernel/rcupdate.c now have identical conditions with nothing between them, so the commit merges them into a single ifdef. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4c..faeea984dbaa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -145,9 +145,6 @@ static struct lock_class_key rcu_sched_lock_key; struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC int debug_lockdep_rcu_enabled(void) { -- cgit v1.2.3 From 99f88919f8fa8a8b01b5306c59c9977b94604df8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Mar 2013 16:54:14 -0700 Subject: rcu: Remove srcu_read_lock_raw() and srcu_read_unlock_raw(). These interfaces never did get used, so this commit removes them, their rcutorture tests, and documentation referencing them. Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e1f3a8c96724..b1fa5510388d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = { .name = "srcu_sync" }; -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw" 
-}; - -static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw_sync" -}; - static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -1983,7 +1945,6 @@ rcu_torture_init(void) { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, - &srcu_raw_ops, &srcu_raw_sync_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); -- cgit v1.2.3 From 127781d1ba1ee5bbe1780afa35dd0e71583b143d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 08:44:00 -0700 Subject: rcu: Remove TINY_PREEMPT_RCU TINY_PREEMPT_RCU adds significant code and complexity, but does not offer commensurate benefits. People currently using TINY_PREEMPT_RCU can get much better memory footprint with TINY_RCU, or, if they really need preemptible RCU, they can use TREE_PREEMPT_RCU with a relatively minor degradation in memory footprint. Please note that this move has been widely publicized on LKML (https://lkml.org/lkml/2012/11/12/545) and on LWN (http://lwn.net/Articles/541037/). This commit therefore removes TINY_PREEMPT_RCU. Signed-off-by: Paul E. 
McKenney [ paulmck: Updated to eliminate #else in rcutiny.h as suggested by Josh ] Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 854 ------------------------------------------------ 1 file changed, 854 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8a233002faeb..29a4dd78c8bf 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,763 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -#ifdef CONFIG_TINY_PREEMPT_RCU - -#include - -/* Global control variables for preemptible RCU. */ -struct rcu_preempt_ctrlblk { - struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ - struct rcu_head **nexttail; - /* Tasks blocked in a preemptible RCU */ - /* read-side critical section while an */ - /* preemptible-RCU grace period is in */ - /* progress must wait for a later grace */ - /* period. This pointer points to the */ - /* ->next pointer of the last task that */ - /* must wait for a later grace period, or */ - /* to &->rcb.rcucblist if there is no */ - /* such task. */ - struct list_head blkd_tasks; - /* Tasks blocked in RCU read-side critical */ - /* section. Tasks are placed at the head */ - /* of this list and age towards the tail. */ - struct list_head *gp_tasks; - /* Pointer to the first task blocking the */ - /* current grace period, or NULL if there */ - /* is no such task. */ - struct list_head *exp_tasks; - /* Pointer to first task blocking the */ - /* current expedited grace period, or NULL */ - /* if there is no such task. If there */ - /* is no current expedited grace period, */ - /* then there cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST - struct list_head *boost_tasks; - /* Pointer to first task that needs to be */ - /* priority-boosted, or NULL if no priority */ - /* boosting is needed. If there is no */ - /* current or expedited grace period, there */ - /* can be no such task. 
*/ -#endif /* #ifdef CONFIG_RCU_BOOST */ - u8 gpnum; /* Current grace period. */ - u8 gpcpu; /* Last grace period blocked by the CPU. */ - u8 completed; /* Last grace period completed. */ - /* If all three are equal, RCU is idle. */ -#ifdef CONFIG_RCU_BOOST - unsigned long boost_time; /* When to start boosting (jiffies) */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#ifdef CONFIG_RCU_TRACE - unsigned long n_grace_periods; -#ifdef CONFIG_RCU_BOOST - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { - .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), - RCU_TRACE(.rcb.name = "rcu_preempt") -}; - -static int rcu_preempted_readers_exp(void); -static void rcu_report_exp_done(void); - -/* - * Return true if the CPU has not yet responded to the current grace period. - */ -static int rcu_cpu_blocking_cur_gp(void) -{ - return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Check for a running RCU reader. Because there is only one CPU, - * there can be but one running RCU reader at a time. ;-) - * - * Returns zero if there are no running readers. 
Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section - * - * Returns zero if there are no running readers. Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section. - */ -static int rcu_preempt_running_reader(void) -{ - return current->rcu_read_lock_nesting; -} - -/* - * Check for preempted RCU readers blocking any grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_any(void) -{ - return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); -} - -/* - * Check for preempted RCU readers blocking the current grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_cgp(void) -{ - return rcu_preempt_ctrlblk.gp_tasks != NULL; -} - -/* - * Return true if another preemptible-RCU grace period is needed. - */ -static int rcu_preempt_needs_another_gp(void) -{ - return *rcu_preempt_ctrlblk.rcb.curtail != NULL; -} - -/* - * Return true if a preemptible-RCU grace period is in progress. - * The caller must disable hardirqs. - */ -static int rcu_preempt_gp_in_progress(void) -{ - return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Advance a ->blkd_tasks-list pointer to the next entry, instead - * returning NULL if at the end of the list. 
- */ -static struct list_head *rcu_next_node_entry(struct task_struct *t) -{ - struct list_head *np; - - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; - return np; -} - -#ifdef CONFIG_RCU_TRACE - -#ifdef CONFIG_RCU_BOOST -static void rcu_initiate_boost_trace(void); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Dump additional statistice for TINY_PREEMPT_RCU. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ - seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", - rcu_preempt_ctrlblk.rcb.qlen, - rcu_preempt_ctrlblk.n_grace_periods, - rcu_preempt_ctrlblk.gpnum, - rcu_preempt_ctrlblk.gpcpu, - rcu_preempt_ctrlblk.completed, - "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], - "N."[!rcu_preempt_ctrlblk.gp_tasks], - "E."[!rcu_preempt_ctrlblk.exp_tasks]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", - " ", - "B."[!rcu_preempt_ctrlblk.boost_tasks], - rcu_preempt_ctrlblk.n_tasks_boosted, - rcu_preempt_ctrlblk.n_exp_boosts, - rcu_preempt_ctrlblk.n_normal_boosts, - (int)(jiffies & 0xffff), - (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", - " balk", - rcu_preempt_ctrlblk.n_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, - rcu_preempt_ctrlblk.n_balk_boost_tasks, - rcu_preempt_ctrlblk.n_balk_notyet, - rcu_preempt_ctrlblk.n_balk_nos); -#endif /* #ifdef CONFIG_RCU_BOOST */ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -#ifdef CONFIG_RCU_BOOST - -#include "rtmutex_common.h" - -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO - -/* Controls for rcu_kthread() kthread. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; - -/* - * Carry out RCU priority boosting on the task indicated by ->boost_tasks, - * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 
- */ -static int rcu_boost(void) -{ - unsigned long flags; - struct rt_mutex mtx; - struct task_struct *t; - struct list_head *tb; - - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - return 0; /* Nothing to boost. */ - - local_irq_save(flags); - - /* - * Recheck with irqs disabled: all tasks in need of boosting - * might exit their RCU read-side critical sections on their own - * if we are preempted just before disabling irqs. - */ - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - local_irq_restore(flags); - return 0; - } - - /* - * Preferentially boost tasks blocking expedited grace periods. - * This cannot starve the normal grace periods because a second - * expedited grace period must boost all blocked tasks, including - * those blocking the pre-existing normal grace period. - */ - if (rcu_preempt_ctrlblk.exp_tasks != NULL) { - tb = rcu_preempt_ctrlblk.exp_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else { - tb = rcu_preempt_ctrlblk.boost_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); - } - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - - /* - * We boost task t by manufacturing an rt_mutex that appears to - * be held by task t. We leave a pointer to that rt_mutex where - * task t can find it, and task t will release the mutex when it - * exits its outermost RCU read-side critical section. Then - * simply acquiring this artificial rt_mutex will boost task - * t's priority. (Thanks to tglx for suggesting this approach!) - */ - t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; - local_irq_restore(flags); - rt_mutex_lock(&mtx); - rt_mutex_unlock(&mtx); /* Keep lockdep happy. 
*/ - - return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || - ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; -} - -/* - * Check to see if it is now time to start boosting RCU readers blocking - * the current grace period, and, if so, tell the rcu_kthread_task to - * start boosting them. If there is an expedited boost in progress, - * we wait for it to complete. - * - * If there are no blocked readers blocking the current grace period, - * return 0 to let the caller know, otherwise return 1. Note that this - * return value is independent of whether or not boosting was done. - */ -static int rcu_initiate_boost(void) -{ - if (!rcu_preempt_blocked_readers_cgp() && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); - return 0; - } - if (rcu_preempt_ctrlblk.exp_tasks != NULL || - (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { - if (rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_callbacks(); - } else { - RCU_TRACE(rcu_initiate_boost_trace()); - } - return 1; -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) - -/* - * Do priority-boost accounting for the start of a new grace period. - */ -static void rcu_preempt_boost_start_gp(void) -{ - rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * If there is no RCU priority boosting, we don't initiate boosting, - * but we do indicate whether there are blocked readers blocking the - * current grace period. - */ -static int rcu_initiate_boost(void) -{ - return rcu_preempt_blocked_readers_cgp(); -} - -/* - * If there is no RCU priority boosting, nothing to do at grace-period start. 
- */ -static void rcu_preempt_boost_start_gp(void) -{ -} - -#endif /* else #ifdef CONFIG_RCU_BOOST */ - -/* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. - * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - * - * Because this is a single-CPU implementation, the only way a grace - * period can end is if the CPU is in a quiescent state. The reason is - * that a blocked preemptible-RCU reader can exit its critical section - * only if the CPU is running it at the time. Therefore, when the - * last task blocking the current grace period exits its RCU read-side - * critical section, neither the CPU nor blocked tasks will be stopping - * the current grace period. (In contrast, SMP implementations - * might have CPUs running in RCU read-side critical sections that - * block later grace periods -- but this is not possible given only - * one CPU.) - */ -static void rcu_preempt_cpu_qs(void) -{ - /* Record both CPU and task as having responded to current GP. */ - rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - - /* If there is no GP then there is nothing more to do. */ - if (!rcu_preempt_gp_in_progress()) - return; - /* - * Check up on boosting. If there are readers blocking the - * current grace period, leave. - */ - if (rcu_initiate_boost()) - return; - - /* Advance callbacks. */ - rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; - rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; - - /* If there are no blocked readers, next GP is done instantly. 
*/ - if (!rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - - /* If there are done callbacks, cause them to be invoked. */ - if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_callbacks(); -} - -/* - * Start a new RCU grace period if warranted. Hard irqs must be disabled. - */ -static void rcu_preempt_start_gp(void) -{ - if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { - - /* Official start of GP. */ - rcu_preempt_ctrlblk.gpnum++; - RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); - reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); - - /* Any blocked RCU readers block new GP. */ - if (rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.gp_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - - /* Set up for RCU priority boosting. */ - rcu_preempt_boost_start_gp(); - - /* If there is no running reader, CPU is done with GP. */ - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); - } -} - -/* - * We have entered the scheduler, and the current task might soon be - * context-switched away from. If this task is in an RCU read-side - * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the blkd_tasks list. - * If the task started after the current grace period began, as recorded - * by ->gpcpu, we enqueue at the beginning of the list. Otherwise - * before the element referenced by ->gp_tasks (or at the tail if - * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. - * The task will dequeue itself when it exits the outermost enclosing - * RCU read-side critical section. Therefore, the current grace period - * cannot be permitted to complete until the ->gp_tasks pointer becomes - * NULL. - * - * Caller must disable preemption. - */ -void rcu_preempt_note_context_switch(void) -{ - struct task_struct *t = current; - unsigned long flags; - - local_irq_save(flags); /* must exclude scheduler_tick(). 
*/ - if (rcu_preempt_running_reader() > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { - - /* Possibly blocking in an RCU read-side critical section. */ - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - - /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. - */ - list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); - if (rcu_cpu_blocking_cur_gp()) - rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; - } else if (rcu_preempt_running_reader() < 0 && - t->rcu_read_unlock_special) { - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - } - - /* - * Either we were not in an RCU read-side critical section to - * begin with, or we have now recorded that critical section - * globally. Either way, we can now note a quiescent state - * for this CPU. Again, if we were in an RCU read-side critical - * section, and if that critical section was blocking the current - * grace period, then the fact that the task has been enqueued - * means that current grace period continues to be blocked. - */ - rcu_preempt_cpu_qs(); - local_irq_restore(flags); -} - -/* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. 
- */ -void rcu_read_unlock_special(struct task_struct *t) -{ - int empty; - int empty_exp; - unsigned long flags; - struct list_head *np; -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ - int special; - - /* - * NMI handlers cannot block and cannot safely manipulate state. - * They therefore cannot possibly be special, so just leave. - */ - if (in_nmi()) - return; - - local_irq_save(flags); - - /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. - */ - special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) - rcu_preempt_cpu_qs(); - - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { - local_irq_restore(flags); - return; - } - - /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - - /* - * Remove this task from the ->blkd_tasks list and adjust - * any pointers that might have been referencing it. - */ - empty = !rcu_preempt_blocked_readers_cgp(); - empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = rcu_next_node_entry(t); - list_del_init(&t->rcu_node_entry); - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) - rcu_preempt_ctrlblk.gp_tasks = np; - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) - rcu_preempt_ctrlblk.exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) - rcu_preempt_ctrlblk.boost_tasks = np; -#endif /* #ifdef CONFIG_RCU_BOOST */ - - /* - * If this was the last task on the current list, and if - * we aren't waiting on the CPU, report the quiescent state - * and start a new grace period if needed. - */ - if (!empty && !rcu_preempt_blocked_readers_cgp()) { - rcu_preempt_cpu_qs(); - rcu_preempt_start_gp(); - } - - /* - * If this was the last task on the expedited lists, - * then we need wake up the waiting task. 
- */ - if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_report_exp_done(); - } -#ifdef CONFIG_RCU_BOOST - /* Unboost self if was boosted. */ - if (t->rcu_boost_mutex != NULL) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - rt_mutex_unlock(rbmp); - } -#endif /* #ifdef CONFIG_RCU_BOOST */ - local_irq_restore(flags); -} - -/* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the rcu_preempt_ctrlblk structure, which is - * checked elsewhere. This is called from the scheduling-clock interrupt. - * - * Caller must disable hard irqs. - */ -static void rcu_preempt_check_callbacks(void) -{ - struct task_struct *t = current; - - if (rcu_preempt_gp_in_progress() && - (!rcu_preempt_running_reader() || - !rcu_cpu_blocking_cur_gp())) - rcu_preempt_cpu_qs(); - if (&rcu_preempt_ctrlblk.rcb.rcucblist != - rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_callbacks(); - if (rcu_preempt_gp_in_progress() && - rcu_cpu_blocking_cur_gp() && - rcu_preempt_running_reader() > 0) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; -} - -/* - * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from rcu_process_callbacks() to - * handle that case. Of course, it is invoked for all flavors of - * RCU, but RCU callbacks can appear only on one of the lists, and - * neither ->nexttail nor ->donetail can possibly be NULL, so there - * is no need for an explicit check. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ - if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) - rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; -} - -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); -} - -/* - * Queue a preemptible -RCU callback for invocation after a grace period. 
- */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - local_irq_save(flags); - *rcu_preempt_ctrlblk.nexttail = head; - rcu_preempt_ctrlblk.nexttail = &head->next; - RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); - rcu_preempt_start_gp(); /* checks to see if GP needed. */ - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (!rcu_scheduler_active) - return; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - if (!rcu_preempt_blocked_readers_any()) - return; - - /* Once we get past the fastpath checks, same code as rcu_barrier(). */ - if (rcu_expedited) - synchronize_rcu_expedited(); - else - rcu_barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. 
- */ -static int rcu_preempted_readers_exp(void) -{ - return rcu_preempt_ctrlblk.exp_tasks != NULL; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. - */ -static void rcu_report_exp_done(void) -{ - wake_up(&sync_rcu_preempt_exp_wq); -} - -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to rely in the fact that there is but one CPU, and that it is - * illegal for a task to invoke synchronize_rcu_expedited() while in a - * preemptible-RCU read-side critical section. Therefore, any such - * critical sections must correspond to blocked tasks, which must therefore - * be on the ->blkd_tasks list. So just record the current head of the - * list in the ->exp_tasks pointer, and wait for all tasks including and - * after the task pointed to by ->exp_tasks to drain. - */ -void synchronize_rcu_expedited(void) -{ - unsigned long flags; - struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; - unsigned long snap; - - barrier(); /* ensure prior action seen before grace period. */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - - /* - * Acquire lock so that there is only one preemptible RCU grace - * period in flight. Of course, if someone does the expedited - * grace period for us while we are acquiring the lock, just leave. - */ - snap = sync_rcu_preempt_exp_count + 1; - mutex_lock(&sync_rcu_preempt_exp_mutex); - if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) - goto unlock_mb_ret; /* Others did our work for us. */ - - local_irq_save(flags); - - /* - * All RCU readers have to already be on blkd_tasks because - * we cannot legally be executing in an RCU read-side critical - * section. - */ - - /* Snapshot current head of ->blkd_tasks list. */ - rpcp->exp_tasks = rpcp->blkd_tasks.next; - if (rpcp->exp_tasks == &rpcp->blkd_tasks) - rpcp->exp_tasks = NULL; - - /* Wait for tail of ->blkd_tasks list to drain. 
*/ - if (!rcu_preempted_readers_exp()) { - local_irq_restore(flags); - } else { - rcu_initiate_boost(); - local_irq_restore(flags); - wait_event(sync_rcu_preempt_exp_wq, - !rcu_preempted_readers_exp()); - } - - /* Clean up and exit. */ - barrier(); /* ensure expedited GP seen before counter increment. */ - sync_rcu_preempt_exp_count++; -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); - barrier(); /* ensure subsequent action seen after grace period. */ -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* - * Does preemptible RCU need the CPU to stay out of dynticks mode? - */ -int rcu_preempt_needs_cpu(void) -{ - return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; -} - -#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ - #ifdef CONFIG_RCU_TRACE /* @@ -895,79 +138,6 @@ static void rcu_preempt_process_callbacks(void) { } -#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST - -/* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_callbacks(void) -{ - have_rcu_kthread_work = 1; - if (rcu_kthread_task != NULL) - wake_up(&rcu_kthread_wq); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return rcu_kthread_task == current; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. 
- */ -static int rcu_kthread(void *arg) -{ - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) - rcu_process_callbacks(NULL); - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ -} - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); - -#else /* #ifdef CONFIG_RCU_BOOST */ - /* Hold off callback invocation until early_initcall() time. */ static int rcu_scheduler_fully_active __read_mostly; @@ -1001,8 +171,6 @@ static int __init rcu_scheduler_really_started(void) } early_initcall(rcu_scheduler_really_started); -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC #include @@ -1020,25 +188,6 @@ void __init rcu_scheduler_starting(void) #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_BOOST - -static void rcu_initiate_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_balk_blkd_tasks++; - else if (rcu_preempt_ctrlblk.gp_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; - else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_balk_boost_tasks++; - else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_balk_notyet++; - else - rcu_preempt_ctrlblk.n_balk_nos++; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { 
unsigned long flags; @@ -1105,9 +254,6 @@ MODULE_LICENSE("GPL"); static void check_cpu_stall_preempt(void) { -#ifdef CONFIG_TINY_PREEMPT_RCU - check_cpu_stall(&rcu_preempt_ctrlblk.rcb); -#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From 221304e95e1466fb49b630f67a719cc735ec5353 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 08:59:52 -0700 Subject: rcu: Remove show_tiny_preempt_stats() With the removal of CONFIG_TINY_PREEMPT_RCU, show_tiny_preempt_stats() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 29a4dd78c8bf..cf0bc22434c0 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,18 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -#ifdef CONFIG_RCU_TRACE - -/* - * Because preemptible RCU does not exist, it is not necessary to - * dump out its statistics. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -202,7 +190,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) */ static int show_tiny_stats(struct seq_file *m, void *unused) { - show_tiny_preempt_stats(m); seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); return 0; -- cgit v1.2.3 From 9acaac8ced57be8312cbf9f2a1e4f5e23b363493 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 09:02:40 -0700 Subject: rcu: Remove rcu_preempt_check_callbacks() With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_check_callbacks() is now an empty function. 
This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index a0714a51b6d7..91782827775b 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -257,7 +257,6 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); - rcu_preempt_check_callbacks(); } /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index cf0bc22434c0..404b3a31e517 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,14 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. - */ -static void rcu_preempt_check_callbacks(void) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to remove. -- cgit v1.2.3 From 47d65935a7f26f24417585e872e254c7ecc6596f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 09:05:34 -0700 Subject: rcu: Remove rcu_preempt_remove_callbacks() With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_remove_callbacks() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 91782827775b..6f5a2a6cc63f 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -289,7 +289,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; - rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 404b3a31e517..8b835b98114c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,14 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to remove. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ -} - /* * Because preemptible RCU does not exist, it never has any callbacks * to process. -- cgit v1.2.3 From 58c4e69d43df91fd6a55bc070474aad6b7cfb18d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 09:11:12 -0700 Subject: rcu: Remove rcu_preempt_process_callbacks() With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_process_callbacks() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 1 - kernel/rcutiny_plugin.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 6f5a2a6cc63f..7fc2339b0859 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -314,7 +314,6 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); } /* diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8b835b98114c..bfe992407803 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,14 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - /* Hold off callback invocation until early_initcall() time. */ static int rcu_scheduler_fully_active __read_mostly; -- cgit v1.2.3 From 9dc5ad32488a75504349372330cc228d4dd678db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 10:11:15 -0700 Subject: rcu: Simplify RCU_TINY RCU callback invocation TINY_PREEMPT_RCU could use a kthread to handle RCU callback invocation, which required an API to abstract kthread vs. softirq invocation. Now that TINY_PREEMPT_RCU is no longer with us, this commit retires this API in favor of direct use of the relevant softirq primitives. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 14 +++++++++----- kernel/rcutiny_plugin.h | 33 --------------------------------- 2 files changed, 9 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7fc2339b0859..4adc9e26da34 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,7 +44,6 @@ /* Forward declarations for rcutiny_plugin.h. 
*/ struct rcu_ctrlblk; -static void invoke_rcu_callbacks(void); static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, @@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -277,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); return; } @@ -307,7 +306,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); } static void rcu_process_callbacks(struct softirq_action *unused) @@ -379,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index bfe992407803..36fd83c544c8 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -102,39 +102,6 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall_preempt()); } -/* Hold off callback invocation until early_initcall() time. */ -static int rcu_scheduler_fully_active __read_mostly; - -/* - * Start up softirq processing of callbacks. 
- */ -void invoke_rcu_callbacks(void) -{ - if (rcu_scheduler_fully_active) - raise_softirq(RCU_SOFTIRQ); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * There is no callback kthread, so this thread is never it. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static int __init rcu_scheduler_really_started(void) -{ - rcu_scheduler_fully_active = 1; - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ - return 0; -} -early_initcall(rcu_scheduler_really_started); - #ifdef CONFIG_DEBUG_LOCK_ALLOC #include -- cgit v1.2.3 From 4879c84daa7bd6757b99ef76b30d4fcebccfcc6f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 10:18:04 -0700 Subject: rcu: Remove check_cpu_stall_preempt() With the removal of CONFIG_TINY_PREEMPT_RCU, check_cpu_stall_preempt() is now an empty function. This commit therefore eliminates it by inlining it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 36fd83c544c8..bac3a6ecb991 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -82,8 +82,6 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } -static void check_cpu_stall_preempt(void); - #endif /* #ifdef CONFIG_RCU_TRACE */ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) @@ -99,7 +97,6 @@ static void check_cpu_stalls(void) { RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); - RCU_TRACE(check_cpu_stall_preempt()); } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -182,8 +179,4 @@ MODULE_AUTHOR("Paul E. 
McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); -static void check_cpu_stall_preempt(void) -{ -} - #endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From 318bdcd95938ec3a530fc789da662ce159d50d46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2013 10:43:02 -0700 Subject: rcu: Consolidate rcutiny_plugin.h ifdefs This commit rearranges code in order to allow ifdefs to be consolidated in kernel/rcutiny_plugin.h, simplifying the code. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 86 +++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index bac3a6ecb991..65ef1800f4fd 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -53,54 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC +#include + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = rcp->jiffies_stall; - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ -#ifdef CONFIG_RCU_TRACE - rcp->ticks_this_gp = 0; - 
rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#include /* * During boot, we forgive RCU lockdep issues. After this function is @@ -179,4 +135,42 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + #endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); +} -- cgit v1.2.3 From 2439b696cb5303f1eeb6aeebcee19e0056c3dd6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2013 10:15:52 -0700 Subject: rcu: Shrink TINY_RCU by moving exit_rcu() Now that TINY_PREEMPT_RCU is no more, exit_rcu() is always an empty function. 
But if TINY_RCU is going to have an empty function, it should be in include/linux/rcutiny.h, where it does not bloat the kernel. This commit therefore moves exit_rcu() out of kernel/rcupdate.c to kernel/rcutree_plugin.h, and places a static inline empty function in include/linux/rcutiny.h in order to shrink TINY_RCU a bit. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 26 +------------------------- kernel/rcutree_plugin.h | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4c..0be1fa2ea521 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -104,31 +104,7 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (likely(list_empty(&current->rcu_node_entry))) - return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; - __rcu_read_unlock(); -} - -#else /* #ifdef CONFIG_PREEMPT_RCU */ - -void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 207844ea0226..de701bbdb624 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -932,6 +932,24 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled.
+ */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(&current->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; @@ -1100,6 +1118,14 @@ static void __init __rcu_init_preempt(void) { } +/* + * Because preemptible RCU does not exist, tasks cannot possibly exit + * while in preemptible RCU read-side critical sections. + */ +void exit_rcu(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST -- cgit v1.2.3 From 14961444696effb2e660fe876e5c1880f8bc3932 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Apr 2013 07:49:22 -0700 Subject: rcu: Shrink TINY_RCU by reworking CPU-stall ifdefs TINY_RCU's reset_cpu_stall_ticks() and check_cpu_stalls() functions are defined unconditionally, and are empty functions if CONFIG_RCU_TRACE is disabled (which in turn disables detection of RCU CPU stalls). This commit saves a few lines of source code by defining these functions only if CONFIG_RCU_TRACE=y. Signed-off-by: Paul E.
McKenney --- kernel/rcutiny.c | 4 ++-- kernel/rcutiny_plugin.h | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 4adc9e26da34..aa344111de3e 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -204,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - reset_cpu_stall_ticks(rcp); + RCU_TRACE(reset_cpu_stall_ticks(rcp)); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -251,7 +251,7 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { - check_cpu_stalls(); + RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 65ef1800f4fd..0cd385acccfa 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -158,15 +158,11 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } -#endif /* #ifdef CONFIG_RCU_TRACE */ - static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { -#ifdef CONFIG_RCU_TRACE rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ } static void check_cpu_stalls(void) @@ -174,3 +170,5 @@ static void check_cpu_stalls(void) RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); } + +#endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From ee23871389d51e07380d23887333622fbe7d3dd9 Mon Sep 17 00:00:00 2001 From: Ivo Sieben Date: Mon, 3 Jun 2013 12:12:02 +0200 Subject: genirq: Set irq thread to RT priority on creation When a threaded irq handler is installed the irq thread is initially created on normal scheduling priority. 
Only after the irq thread is woken up it sets its priority to RT_FIFO MAX_USER_RT_PRIO/2 itself. This means that interrupts that occur directly after the irq handler is installed will be handled on a normal scheduling priority instead of the realtime priority that one would expect. Fix this by setting the RT priority on creation of the irq_thread. Signed-off-by: Ivo Sieben Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1370254322-17240-1-git-send-email-meltedpianoman@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa17855ca65a..e16caa81f887 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused) static int irq_thread(void *data) { struct callback_head on_exit_work; - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, @@ -854,8 +851,6 @@ static int irq_thread(void *data) else handler_fn = irq_thread_fn; - sched_setscheduler(current, SCHED_FIFO, &param); - init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); @@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (new->thread_fn && !nested) { struct task_struct *t; + static const struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); @@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = PTR_ERR(t); goto out_mput; } + + sched_setscheduler(t, SCHED_FIFO, &param); + /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code --
cgit v1.2.3 From 9350de06be45a5a8b927ac6577c9d35de61c90ca Mon Sep 17 00:00:00 2001 From: Bernie Thompson Date: Sat, 1 Jun 2013 00:47:43 +0000 Subject: PM / wakeup: Adjust messaging for wake events during suspend This adds in a new message to the wakeup code which adds an indication to the log that suspend was cancelled due to a wake event occurring during the suspend sequence. It also adjusts the message printed in suspend.c to reflect the potential that a suspend was aborted, as opposed to a device failing to suspend. Without these message adjustments one can end up with a kernel log that says that a device failed to suspend with no actual device suspend failures, which can be confusing to the log examiner. Signed-off-by: Bernie Thompson Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bef86d121eb2..ece04223bb1e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); + pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); goto Recover_platform; } suspend_test_finish("suspend devices"); -- cgit v1.2.3 From ad71d889b88055e61e3970a6744a271a51a94f42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 15:46:14 -0400 Subject: tracing: Add function probe to trigger a ftrace dump to console Add the "dump" command to have the ftrace buffer dumped to console if a function is hit. This is useful when debugging a triple fault, where you have an idea of a function that is called just before the triple fault occurs, and can tell ftrace to dump its content out to the console before it continues.
Format is: :dump echo 'bad_address:dump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:dump' > /debug/tracing/set_ftrace_filter Requested-by: Luis Claudio R. Goncalves Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 59 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d7191988..d7c8719734b8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,6 +290,13 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) trace_dump_stack(STACK_SKIP); } +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -327,6 +334,13 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("stacktrace", m, ip, data); } +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -342,6 +356,11 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = { .print = ftrace_stacktrace_print, }; +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -425,6 +444,19 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash, param, enable); } +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump 
once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -440,6 +472,11 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = { .func = ftrace_stacktrace_callback, }; +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -450,13 +487,25 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); + goto out_free_traceoff; ret = register_ftrace_command(&ftrace_stacktrace_cmd); - if (ret) { - unregister_ftrace_command(&ftrace_traceoff_cmd); - unregister_ftrace_command(&ftrace_traceon_cmd); - } + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + return 0; + + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; } #else -- cgit v1.2.3 From 90e3c03c3a09a7b176b3fe59d78f5d9755ac8e37 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Apr 2013 19:00:46 -0400 Subject: tracing: Add function probe to trigger a ftrace dump of current CPU trace Add the "cpudump" command to have the current CPU ftrace buffer dumped to console if a function is hit. This is useful when debugging a triple fault, where you have an idea of a function that is called just before the triple fault occurs, and can tell ftrace to dump its content out to the console before it continues. This differs from the "dump" command as it only dumps the content of the ring buffer for the currently executing CPU, and does not show the contents of the other CPUs.
Format is: :cpudump echo 'bad_address:cpudump' > /debug/tracing/set_ftrace_filter To remove this: echo '!bad_address:cpudump' > /debug/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d7c8719734b8..b863f93b30f3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -297,6 +297,14 @@ ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) ftrace_dump(DUMP_ALL); } +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -341,6 +349,13 @@ ftrace_dump_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("dump", m, ip, data); } +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -361,6 +376,11 @@ static struct ftrace_probe_ops dump_probe_ops = { .print = ftrace_dump_print, }; +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -457,6 +477,19 @@ ftrace_dump_callback(struct ftrace_hash *hash, "1", enable); } +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &cpudump_probe_ops; + + /* Only dump once. 
*/ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -477,6 +510,11 @@ static struct ftrace_func_command ftrace_dump_cmd = { .func = ftrace_dump_callback, }; +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -497,8 +535,14 @@ static int __init init_func_cmd_traceon(void) if (ret) goto out_free_stacktrace; + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + return 0; + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); out_free_stacktrace: unregister_ftrace_command(&ftrace_stacktrace_cmd); out_free_traceon: -- cgit v1.2.3 From 8092e808a31839c502a52d391b15f31c1d8764f5 Mon Sep 17 00:00:00 2001 From: Harsh Prateek Bora Date: Fri, 24 May 2013 12:52:17 +0530 Subject: tracing/trivial: Consolidate error return condition Consolidate the checks for !enabled and !param to return -EINVAL in event_enable_func(). 
Link: http://lkml.kernel.org/r/1369380137-12452-1-git-send-email-harsh@linux.vnet.ibm.com Signed-off-by: Harsh Prateek Bora Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4bf..db086f172cf5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2011,10 +2011,7 @@ event_enable_func(struct ftrace_hash *hash, int ret; /* hash funcs only work with set_ftrace_filter */ - if (!enabled) - return -EINVAL; - - if (!param) + if (!enabled || !param) return -EINVAL; system = strsep(&param, ":"); -- cgit v1.2.3 From 238ae93d699d59876b470bf6455de22bcfaa9a1b Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Sun, 26 May 2013 16:52:01 +0800 Subject: tracing: Fix file mode of free_buffer Commit 4f271a2a60c748599b30bb4dafff30d770439b96 (tracing: Add a proc file to stop tracing and free buffer) implemented a method to free up ring buffer in kernel memory in the release code path of free_buffer's fd. Then we don't need read/write support for free_buffer, indeed we just have a dummy write fop, and don't implement read fop. So the 0200 is more reasonable file mode for free_buffer than the current file mode 0644.
Link: http://lkml.kernel.org/r/20130526085201.GA3183@udknight Acked-by: Vaibhav Nagarnaik Acked-by: David Sharp Signed-off-by: Wang YanQing Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f88..5f4a09c12e0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5935,7 +5935,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); - trace_create_file("free_buffer", 0644, d_tracer, + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, -- cgit v1.2.3 From 7614c3dc74733dff4b0e774f7a894b9ea6ec508c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 20:01:16 -0400 Subject: ftrace: Use schedule_on_each_cpu() as a heavy synchronize_sched() The function tracer uses preempt_disable/enable_notrace() for synchronization between reading registered ftrace_ops and unregistering them. Most of the ftrace_ops are global permanent structures that do not require this synchronization. That is, ops may be added and removed from the hlist but are never freed, and won't hurt if a synchronization is missed. But this is not true for dynamically created ftrace_ops or control_ops, which are used by the perf function tracing. The problem here is that the function tracer can be used to trace kernel/user context switches as well as going to and from idle. Basically, it can be used to trace blind spots of the RCU subsystem. This means that even though preempt_disable() is done, a synchronize_sched() will ignore CPUs that haven't made it out of user space or idle. These can include functions that are being traced just before entering or exiting the kernel sections.
To implement the RCU synchronization, instead of using synchronize_sched() the use of schedule_on_each_cpu() is performed. This means that when a dynamically allocated ftrace_ops, or a control ops is being unregistered, all CPUs must be touched and execute a ftrace_sync() stub function via the work queues. This will rip CPUs out from idle or in dynamic tick mode. This only happens when a user disables perf function tracing or other dynamically allocated function tracers, but it allows us to continue to debug RCU and context tracking with function tracing. Link: http://lkml.kernel.org/r/1369785676.15552.55.camel@gandalf.local.home Cc: "Paul E. McKenney" Cc: Tejun Heo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c62..800a8a2fbddb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * so there'll be no new users. We must ensure * all current users are done before we free * the control data. + * Note synchronize_sched() is not enough, as we + * use preempt_disable() to do RCU, but the function + * tracer can be called where RCU is not active + * (before user_exit()). 
*/ - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); control_ops_free(ops); } } else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); + return 0; } -- cgit v1.2.3 From aaf6ac0f0871cb7fc0f28f3a00edf329bc7adc29 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 7 Jun 2013 15:07:48 +0900 Subject: tracing: Do not call kmem_cache_free() on allocation failure There's no point calling it when _alloc() failed. Link: http://lkml.kernel.org/r/1370585268-29169-1-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db086f172cf5..f57b01574a30 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,7 +97,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) - goto err; + return -ENOMEM; field->name = name; field->type = type; @@ -114,11 +114,6 @@ static int __trace_define_field(struct list_head *head, const char *type, list_add(&field->link, head); return 0; - -err: - kmem_cache_free(field_cachep, field); - - return -ENOMEM; } int trace_define_field(struct ftrace_event_call *call, const char *type, -- cgit v1.2.3 From 11682a41618f8094cb7a9330b4b6a12ffaef5774 Mon Sep 17 00:00:00 2001 From: Marcus Gelderie Date: Tue, 4 Jun 2013 09:32:09 +0200 Subject: alarmtimer: Export symbols of functions declared in linux/alarmtimer.h Export symbols so they can be used by 
drivers/staging/android/alarm-dev.c if it is built as a module. So far alarm-dev is built-in but module support is planned (see drivers/staging/android/TODO). Signed-off-by: Marcus Gelderie [jstultz: tweaked commit message, also export newly added functions] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 3e5cba274475..eec50fcef9e4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -204,6 +204,7 @@ ktime_t alarm_expires_remaining(const struct alarm *alarm) struct alarm_base *base = &alarm_bases[alarm->type]; return ktime_sub(alarm->node.expires, base->gettime()); } +EXPORT_SYMBOL_GPL(alarm_expires_remaining); #ifdef CONFIG_RTC_CLASS /** @@ -309,6 +310,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } +EXPORT_SYMBOL_GPL(alarm_init); /** * alarm_start - Sets an absolute alarm to fire @@ -329,6 +331,7 @@ int alarm_start(struct alarm *alarm, ktime_t start) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_start); /** * alarm_start_relative - Sets a relative alarm to fire @@ -342,6 +345,7 @@ int alarm_start_relative(struct alarm *alarm, ktime_t start) start = ktime_add(start, base->gettime()); return alarm_start(alarm, start); } +EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { @@ -354,6 +358,7 @@ void alarm_restart(struct alarm *alarm) alarmtimer_enqueue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); } +EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -375,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** @@ -392,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) cpu_relax(); 
} } +EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -424,6 +431,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add(alarm->node.expires, interval); return overrun; } +EXPORT_SYMBOL_GPL(alarm_forward); u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { @@ -431,7 +439,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) return alarm_forward(alarm, base->gettime(), interval); } - +EXPORT_SYMBOL_GPL(alarm_forward_now); /** -- cgit v1.2.3 From 38ff87f77af0b5a93fc8581cff1d6e5692ab8970 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 1 Jun 2013 23:39:40 -0700 Subject: sched_clock: Make ARM's sched_clock generic for all architectures Nothing about the sched_clock implementation in the ARM port is specific to the architecture. Generalize the code so that other architectures can use it by selecting GENERIC_SCHED_CLOCK. Signed-off-by: Stephen Boyd [jstultz: Merge minor collisions with other patches in my tree] Signed-off-by: John Stultz --- kernel/time/Makefile | 1 + kernel/time/sched_clock.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 kernel/time/sched_clock.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index d52ac8bf0006..9250130646f5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -4,6 +4,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000000..aad1ae6077ef --- 
/dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,215 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct clock_data { + u64 epoch_ns; + u32 epoch_cyc; + u32 epoch_cyc_copy; + unsigned long rate; + u32 mult; + u32 shift; + bool suspended; +}; + +static void sched_clock_poll(unsigned long wrap_ticks); +static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { + .mult = NSEC_PER_SEC / HZ, +}; + +static u32 __read_mostly sched_clock_mask = 0xffffffff; + +static u32 notrace jiffy_sched_clock_read(void) +{ + return (u32)(jiffies - INITIAL_JIFFIES); +} + +static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) +{ + u64 epoch_ns; + u32 epoch_cyc; + + /* + * Load the epoch_cyc and epoch_ns atomically. We do this by + * ensuring that we always write epoch_cyc, epoch_ns and + * epoch_cyc_copy in strict order, and read them in strict order. + * If epoch_cyc and epoch_cyc_copy are not equal, then we're in + * the middle of an update, and we should repeat the load. + */ + do { + epoch_cyc = cd.epoch_cyc; + smp_rmb(); + epoch_ns = cd.epoch_ns; + smp_rmb(); + } while (epoch_cyc != cd.epoch_cyc_copy); + + return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. 
+ */ +static void notrace update_sched_clock(void) +{ + unsigned long flags; + u32 cyc; + u64 ns; + + cyc = read_sched_clock(); + ns = cd.epoch_ns + + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + /* + * Write epoch_cyc and epoch_ns in a way that the update is + * detectable in cyc_to_sched_clock(). + */ + raw_local_irq_save(flags); + cd.epoch_cyc_copy = cyc; + smp_wmb(); + cd.epoch_ns = ns; + smp_wmb(); + cd.epoch_cyc = cyc; + raw_local_irq_restore(flags); +} + +static void sched_clock_poll(unsigned long wrap_ticks) +{ + mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); + update_sched_clock(); +} + +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +{ + unsigned long r, w; + u64 res, wrap; + char r_unit; + + if (cd.rate > rate) + return; + + BUG_ON(bits > 32); + WARN_ON(!irqs_disabled()); + read_sched_clock = read; + sched_clock_mask = (1 << bits) - 1; + cd.rate = rate; + + /* calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else + r_unit = ' '; + + /* calculate how many ns until we wrap */ + wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); + do_div(wrap, NSEC_PER_MSEC); + w = wrap; + + /* calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, cd.mult, cd.shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", + bits, r, r_unit, res, w); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch.
+ */ + sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); + update_sched_clock(); + + /* + * Ensure that sched_clock() starts off at 0ns + */ + cd.epoch_ns = 0; + + /* Enable IRQ time accounting if we have a fast enough sched_clock */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +static unsigned long long notrace sched_clock_32(void) +{ + u32 cyc = read_sched_clock(); + return cyc_to_sched_clock(cyc, sched_clock_mask); +} + +unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; + +unsigned long long notrace sched_clock(void) +{ + if (cd.suspended) + return cd.epoch_ns; + + return sched_clock_func(); +} + +void __init sched_clock_postinit(void) +{ + /* + * If no sched_clock function has been provided at that point, + * make it the final one. + */ + if (read_sched_clock == jiffy_sched_clock_read) + setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + + sched_clock_poll(sched_clock_timer.data); +} + +static int sched_clock_suspend(void) +{ + sched_clock_poll(sched_clock_timer.data); + cd.suspended = true; + return 0; +} + +static void sched_clock_resume(void) +{ + cd.epoch_cyc = read_sched_clock(); + cd.epoch_cyc_copy = cd.epoch_cyc; + cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + return 0; +} +device_initcall(sched_clock_syscore_init); -- cgit v1.2.3 From 33ad801dfb5c8b1127c72fdb745ce8c630150f3f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:15:08 +0800 Subject: cpuset: record old_mems_allowed in struct cpuset When we update a cpuset's mems_allowed and thus update tasks' mems_allowed, it's required to pass the old mems_allowed and new mems_allowed to cpuset_migrate_mm().
Currently we save old mems_allowed in a temp local variable before changing cpuset->mems_allowed. This patch changes it by saving old mems_allowed in cpuset->old_mems_allowed. This currently won't change any behavior, but it will later allow us to keep tasks in empty cpusets. v3: restored "cpuset_attach_nodemask_to = cs->mems_allowed" Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 61 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 608fe1308b22..2b4554588a04 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -88,6 +88,18 @@ struct cpuset { cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * This is old Memory Nodes tasks took on. + * + * - top_cpuset.old_mems_allowed is initialized to mems_allowed. + * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. 
+ */ + nodemask_t old_mems_allowed; + struct fmeter fmeter; /* memory_pressure filter */ /* @@ -972,16 +984,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) { + struct cpuset *cs = cgroup_cs(scan->cg); struct mm_struct *mm; - struct cpuset *cs; int migrate; - const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cpuset_mutex */ - - cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); + nodemask_t *newmems = scan->data; - cpuset_change_task_nodemask(p, &newmems); + cpuset_change_task_nodemask(p, newmems); mm = get_task_mm(p); if (!mm) @@ -991,7 +999,7 @@ static void cpuset_change_nodemask(struct task_struct *p, mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); + cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); mmput(mm); } @@ -1000,25 +1008,26 @@ static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @oldmem: old mems_allowed of cpuset cs * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cpuset_mutex held * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * if @heap != NULL. 
*/ -static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, - struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { + static nodemask_t newmems; /* protected by cpuset_mutex */ struct cgroup_scanner scan; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ + guarantee_online_mems(cs, &newmems); + scan.cg = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_nodemask; scan.heap = heap; - scan.data = (nodemask_t *)oldmem; + scan.data = &newmems; /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1032,6 +1041,12 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, */ cgroup_scan_tasks(&scan); + /* + * All the tasks' nodemasks have been updated, update + * cs->old_mems_allowed. + */ + cs->old_mems_allowed = newmems; + /* We're done rebinding vmas to this cpuset's new mems_allowed. */ cpuset_being_rebound = NULL; } @@ -1052,13 +1067,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; - if (!oldmem) - return -ENOMEM; - /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; * it's read-only @@ -1087,8 +1098,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; } } - *oldmem = cs->mems_allowed; - if (nodes_equal(*oldmem, trialcs->mems_allowed)) { + + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1104,11 +1115,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, oldmem, &heap); + update_tasks_nodemask(cs, &heap); heap_free(&heap); done: - NODEMASK_FREE(oldmem); return retval; } @@ -1431,6 +1441,8 @@ 
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) mmput(mm); } + cs->old_mems_allowed = cpuset_attach_nodemask_to; + cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); @@ -1985,7 +1997,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) static void cpuset_hotplug_update_tasks(struct cpuset *cs) { static cpumask_t off_cpus; - static nodemask_t off_mems, tmp_mems; + static nodemask_t off_mems; bool is_empty; retry: @@ -2015,11 +2027,10 @@ retry: /* remove offline mems from @cs */ if (!nodes_empty(off_mems)) { - tmp_mems = cs->mems_allowed; mutex_lock(&callback_mutex); nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &tmp_mems, NULL); + update_tasks_nodemask(cs, NULL); } is_empty = cpumask_empty(cs->cpus_allowed) || @@ -2083,11 +2094,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - tmp_mems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + update_tasks_nodemask(&top_cpuset, NULL); } mutex_unlock(&cpuset_mutex); @@ -2158,6 +2168,7 @@ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; + top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); } -- cgit v1.2.3 From 070b57fcacc9dfc23a180290079078373fb697e1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:15:22 +0800 Subject: cpuset: introduce effective_{cpumask|nodemask}_cpuset() effective_cpumask_cpuset() returns an ancestor cpuset which has non-empty cpumask. If a cpuset is empty and the tasks in it need to update their cpus_allowed, they take on the ancestor cpuset's cpumask. 
This currently won't change any behavior, but it will later allow us to keep tasks in empty cpusets. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2b4554588a04..82ac1f862cbc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -791,6 +791,45 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } +/* + * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus + * @cs: the cpuset in interest + * + * A cpuset's effective cpumask is the cpumask of the nearest ancestor + * with non-empty cpus. We use effective cpumask whenever: + * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask + * if the cpuset they reside in has no cpus) + * - we want to retrieve task_cs(tsk)'s cpus_allowed. + * + * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an + * exception. See comments there. + */ +static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) +{ + while (cpumask_empty(cs->cpus_allowed)) + cs = parent_cs(cs); + return cs; +} + +/* + * effective_nodemask_cpuset - return nearest ancestor with non-empty mems + * @cs: the cpuset in interest + * + * A cpuset's effective nodemask is the nodemask of the nearest ancestor + * with non-empty mems. We use effective nodemask whenever: + * - we update tasks' mems_allowed. (they take on the ancestor's nodemask + * if the cpuset they reside in has no mems) + * - we want to retrieve task_cs(tsk)'s mems_allowed. + * + * Called with cpuset_mutex held.
+ */ +static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) +{ + while (nodes_empty(cs->mems_allowed)) + cs = parent_cs(cs); + return cs; +} + /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test @@ -805,7 +844,10 @@ void rebuild_sched_domains(void) static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); + struct cpuset *cpus_cs; + + cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); + set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } /** @@ -920,12 +962,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct task_struct *tsk = current; + struct cpuset *mems_cs; tsk->mems_allowed = *to; do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); + mems_cs = effective_nodemask_cpuset(task_cs(tsk)); + guarantee_online_mems(mems_cs, &tsk->mems_allowed); } /* @@ -1018,10 +1062,11 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct cgroup_scanner scan; + struct cpuset *mems_cs = effective_nodemask_cpuset(cs); cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - guarantee_online_mems(cs, &newmems); + guarantee_online_mems(mems_cs, &newmems); scan.cg = cs->css.cgroup; scan.test_task = NULL; @@ -1405,6 +1450,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct cpuset *mems_cs = effective_nodemask_cpuset(cs); mutex_lock(&cpuset_mutex); @@ -1412,9 +1459,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) if (cs == &top_cpuset) 
cpumask_copy(cpus_attach, cpu_possible_mask); else - guarantee_online_cpus(cs, cpus_attach); + guarantee_online_cpus(cpus_cs, cpus_attach); - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, cgrp, tset) { /* @@ -1434,9 +1481,11 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(leader); if (mm) { + struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, + cpuset_migrate_mm(mm, &mems_oldcs->mems_allowed, &cpuset_attach_nodemask_to); mmput(mm); } @@ -2186,20 +2235,23 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { + struct cpuset *cpus_cs; + mutex_lock(&callback_mutex); task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); + cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); + guarantee_online_cpus(cpus_cs, pmask); task_unlock(tsk); mutex_unlock(&callback_mutex); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - const struct cpuset *cs; + const struct cpuset *cpus_cs; rcu_read_lock(); - cs = task_cs(tsk); - do_set_cpus_allowed(tsk, cs->cpus_allowed); + cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); + do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); rcu_read_unlock(); /* @@ -2238,11 +2290,13 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { + struct cpuset *mems_cs; nodemask_t mask; mutex_lock(&callback_mutex); task_lock(tsk); - guarantee_online_mems(task_cs(tsk), &mask); + mems_cs = effective_nodemask_cpuset(task_cs(tsk)); + guarantee_online_mems(mems_cs, &mask); task_unlock(tsk); mutex_unlock(&callback_mutex); -- cgit v1.2.3 From 5c5cc62321d9df7a9a608346fc649c4528380c8f Mon Sep 17 00:00:00 2001 From: Li Zefan 
Date: Sun, 9 Jun 2013 17:16:29 +0800 Subject: cpuset: allow to keep tasks in empty cpusets To achieve this: - We call update_tasks_cpumask/nodemask() for empty cpusets when hotplug happens, instead of moving tasks out of them. - When a cpuset's masks are changed by writing cpuset.cpus/mems, we also update tasks in child cpusets which are empty. v3: - do propagation work in one place for both hotplug and unplug v2: - drop rcu_read_lock before calling update_task_nodemask() and update_task_cpumask(), instead of using workqueue. - add documentation in include/linux/cgroup.h Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 141 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 110 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 82ac1f862cbc..3473dd2580d1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -874,6 +874,45 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) cgroup_scan_tasks(&scan); } +/* + * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. + * @root_cs: the root cpuset of the hierarchy + * @update_root: update root cpuset or not? + * @heap: the heap used by cgroup_scan_tasks() + * + * This will update cpumasks of tasks in @root_cs and all other empty cpusets + * which take on cpumask of @root_cs. 
+ * + * Called with cpuset_mutex held + */ +static void update_tasks_cpumask_hier(struct cpuset *root_cs, + bool update_root, struct ptr_heap *heap) +{ + struct cpuset *cp; + struct cgroup *pos_cgrp; + + if (update_root) + update_tasks_cpumask(root_cs, heap); + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp have some CPU */ + if (!cpumask_empty(cp->cpus_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + continue; + } + if (!css_tryget(&cp->css)) + continue; + rcu_read_unlock(); + + update_tasks_cpumask(cp, heap); + + rcu_read_lock(); + css_put(&cp->css); + } + rcu_read_unlock(); +} + /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider @@ -925,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - /* - * Scan tasks in the cpuset, and update the cpumasks of any - * that need an update. - */ - update_tasks_cpumask(cs, &heap); + update_tasks_cpumask_hier(cs, true, &heap); heap_free(&heap); @@ -1096,6 +1131,45 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) cpuset_being_rebound = NULL; } +/* + * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. + * @cs: the root cpuset of the hierarchy + * @update_root: update the root cpuset or not? + * @heap: the heap used by cgroup_scan_tasks() + * + * This will update nodemasks of tasks in @root_cs and all other empty cpusets + * which take on nodemask of @root_cs. 
+ * + * Called with cpuset_mutex held + */ +static void update_tasks_nodemask_hier(struct cpuset *root_cs, + bool update_root, struct ptr_heap *heap) +{ + struct cpuset *cp; + struct cgroup *pos_cgrp; + + if (update_root) + update_tasks_nodemask(root_cs, heap); + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp has some memory nodes */ + if (!nodes_empty(cp->mems_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + continue; + } + if (!css_tryget(&cp->css)) + continue; + rcu_read_unlock(); + + update_tasks_nodemask(cp, heap); + + rcu_read_lock(); + css_put(&cp->css); + } + rcu_read_unlock(); +} + /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the @@ -1160,7 +1234,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &heap); + update_tasks_nodemask_hier(cs, true, &heap); heap_free(&heap); done: @@ -2048,6 +2122,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) static cpumask_t off_cpus; static nodemask_t off_mems; bool is_empty; + bool sane = cgroup_sane_behavior(cs->css.cgroup); retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2066,21 +2141,29 @@ retry: cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - /* remove offline cpus from @cs */ - if (!cpumask_empty(&off_cpus)) { - mutex_lock(&callback_mutex); - cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); - mutex_unlock(&callback_mutex); + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + + /* + * If sane_behavior flag is set, we need to update tasks' cpumask + * for empty cpuset to take on ancestor's cpumask.
+ */ + if ((sane && cpumask_empty(cs->cpus_allowed)) || + !cpumask_empty(&off_cpus)) update_tasks_cpumask(cs, NULL); - } - /* remove offline mems from @cs */ - if (!nodes_empty(off_mems)) { - mutex_lock(&callback_mutex); - nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); - mutex_unlock(&callback_mutex); + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + + /* + * If sane_behavior flag is set, we need to update tasks' nodemask + * for empty cpuset to take on ancestor's nodemask. + */ + if ((sane && nodes_empty(cs->mems_allowed)) || + !nodes_empty(off_mems)) update_tasks_nodemask(cs, NULL); - } is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2088,11 +2171,13 @@ retry: mutex_unlock(&cpuset_mutex); /* - * If @cs became empty, move tasks to the nearest ancestor with - * execution resources. This is full cgroup operation which will + * If sane_behavior flag is set, we'll keep tasks in empty cpusets. + * + * Otherwise move tasks to the nearest ancestor with execution + * resources. This is full cgroup operation which will * also call back into cpuset. Should be done outside any lock. 
*/ - if (is_empty) + if (!sane && is_empty) remove_tasks_in_empty_cpuset(cs); } @@ -2114,10 +2199,9 @@ retry: */ static void cpuset_hotplug_workfn(struct work_struct *work) { - static cpumask_t new_cpus, tmp_cpus; - static nodemask_t new_mems, tmp_mems; + static cpumask_t new_cpus; + static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool cpus_offlined, mems_offlined; mutex_lock(&cpuset_mutex); @@ -2126,12 +2210,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) new_mems = node_states[N_MEMORY]; cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); - cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, - &new_cpus); - mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); - nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); - mems_offlined = !nodes_empty(tmp_mems); /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { @@ -2151,8 +2230,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work) mutex_unlock(&cpuset_mutex); - /* if cpus or mems went down, we need to propagate to descendants */ - if (cpus_offlined || mems_offlined) { + /* if cpus or mems changed, we need to propagate to descendants */ + if (cpus_updated || mems_updated) { struct cpuset *cs; struct cgroup *pos_cgrp; -- cgit v1.2.3 From 88fa523bff295f1d60244a54833480b02f775152 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 9 Jun 2013 17:16:46 +0800 Subject: cpuset: allow to move tasks to empty cpusets Currently some cpuset behaviors are not friendly when cpuset is co-mounted with other cgroup controllers. Now with this patchset if cpuset is mounted with sane_behavior option, it behaves differently: - Tasks will be kept in empty cpusets when hotplug happens and take masks of ancestors with non-empty cpus/mems, instead of being moved to an ancestor. 
- A task can be moved into an empty cpuset, and again it takes masks of ancestors, so the user can drop a task into a newly created cgroup without having to do anything for it. As tasks can reside in empty cpusets, here're some rules: - They can be moved to another cpuset, regardless of whether it's empty or not. - Though it takes masks from ancestors, it takes other configs from the empty cpuset. - If the ancestors' masks are changed, those tasks will also be updated to take new masks. v2: add documentation in include/linux/cgroup.h Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3473dd2580d1..3b3fdfdd4d78 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -479,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) */ ret = -ENOSPC; if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && - (cpumask_empty(trial->cpus_allowed) || + (cpumask_empty(trial->cpus_allowed) && nodes_empty(trial->mems_allowed))) goto out; @@ -1466,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); + /* + * We allow to move tasks into an empty cpuset if sane_behavior + * flag is set. + */ ret = -ENOSPC; - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + if (!cgroup_sane_behavior(cgrp) && + (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; cgroup_taskset_for_each(task, cgrp, tset) { -- cgit v1.2.3 From f047cecf2cfc9595b1f39c9aab383bb0682f5a53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 13 Jun 2013 15:11:44 +0800 Subject: cpuset: fix to migrate mm correctly in a corner case Before moving tasks out of empty cpusets, update_tasks_nodemask() is called, which calls do_migrate_pages(xx, from, to).
Then those tasks are moved to an ancestor, and do_migrate_pages() is called again. The first time: from = node_to_be_offlined, to = empty. The second time: from = empty, to = ancestor's nodemask. So it looks like no pages will be migrated. Fix this by: - Don't call update_tasks_nodemask() on empty cpusets. - Pass cs->old_mems_allowed to do_migrate_pages(). v4: added comment in cpuset_hotplug_update_tasks() and rephrased comment in cpuset_attach(). Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3b3fdfdd4d78..4c17d96bd3a5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1563,9 +1563,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &mems_oldcs->mems_allowed, + + /* + * old_mems_allowed is the same with mems_allowed here, except + * if this task is being moved automatically due to hotplug. + * In that case @mems_allowed has been updated and is empty, + * so @old_mems_allowed is the right nodesets that we migrate + * mm from. + */ + if (is_memory_migrate(cs)) { + cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); + } mmput(mm); } @@ -2152,10 +2161,12 @@ retry: /* * If sane_behavior flag is set, we need to update tasks' cpumask - * for empty cpuset to take on ancestor's cpumask. + * for empty cpuset to take on ancestor's cpumask. Otherwise, don't + * call update_tasks_cpumask() if the cpuset becomes empty, as + * the tasks in it will be migrated to an ancestor.
*/ if ((sane && cpumask_empty(cs->cpus_allowed)) || - !cpumask_empty(&off_cpus)) + (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) update_tasks_cpumask(cs, NULL); mutex_lock(&callback_mutex); @@ -2164,10 +2175,12 @@ retry: /* * If sane_behavior flag is set, we need to update tasks' nodemask - * for empty cpuset to take on ancestor's nodemask. + * for empty cpuset to take on ancestor's nodemask. Otherwise, don't + * call update_tasks_nodemask() if the cpuset becomes empty, as + * the tasks in it will be migratd to an ancestor. */ if ((sane && nodes_empty(cs->mems_allowed)) || - !nodes_empty(off_mems)) + (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) update_tasks_nodemask(cs, NULL); is_empty = cpumask_empty(cs->cpus_allowed) || -- cgit v1.2.3 From 3fc3db9a3ae0ce108badf31a4a00e41b4236f5fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:48 -0700 Subject: cgroup: remove now unused css_depth() Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc53d5014b28..d4a329f5874c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5227,18 +5227,6 @@ unsigned short css_id(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_id); -unsigned short css_depth(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - cssid = rcu_dereference_check(css->id, css_refcnt(css)); - - if (cssid) - return cssid->depth; - return 0; -} -EXPORT_SYMBOL_GPL(css_depth); - /** * css_is_ancestor - test "root" css is an ancestor of "child" * @child: the css to be tested. 
-- cgit v1.2.3 From 5abb8855734fd7b3fa7f91c13916d0e35d99763c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:49 -0700 Subject: cgroup: consistently use @cset for struct css_set variables cgroup.c uses @cg for most struct css_set variables, which in itself could be a bit confusing, but made much worse by the fact that there are places which use @cg for struct cgroup variables. compare_css_sets() epitomizes this confusion - @[old_]cg are struct css_set while @cg[12] are struct cgroup. It's not like the whole deal with cgroup, css_set and cg_cgroup_link isn't already confusing enough. Let's give it some sanity by uniformly using @cset for all struct css_set variables. * s/cg/cset/ for all css_set variables. * s/oldcg/old_cset/ s/oldcgrp/old_cgrp/. The same for the ones prefixed with "new". * s/cg/cgrp/ for cgroup variables in compare_css_sets(). * s/css/cset/ for the cgroup variable in task_cgroup_from_root(). * Whiteline adjustments. This patch is purely cosmetic. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 216 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 109 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d4a329f5874c..1f5a4e101ed1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -376,30 +376,32 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -static void __put_css_set(struct css_set *cg, int taskexit) +static void __put_css_set(struct css_set *cset, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; + /* * Ensure that the refcount doesn't hit zero while any readers * can see it. 
Similar to atomic_dec_and_lock(), but for an * rwlock */ - if (atomic_add_unless(&cg->refcount, -1, 1)) + if (atomic_add_unless(&cset->refcount, -1, 1)) return; write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cg->refcount)) { + if (!atomic_dec_and_test(&cset->refcount)) { write_unlock(&css_set_lock); return; } /* This css_set is dead. unlink it and release cgroup refcounts */ - hash_del(&cg->hlist); + hash_del(&cset->hlist); css_set_count--; - list_for_each_entry_safe(link, saved_link, &cg->cg_links, + list_for_each_entry_safe(link, saved_link, &cset->cg_links, cg_link_list) { struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); @@ -421,45 +423,45 @@ static void __put_css_set(struct css_set *cg, int taskexit) } write_unlock(&css_set_lock); - kfree_rcu(cg, rcu_head); + kfree_rcu(cset, rcu_head); } /* * refcounted get/put for css_set objects */ -static inline void get_css_set(struct css_set *cg) +static inline void get_css_set(struct css_set *cset) { - atomic_inc(&cg->refcount); + atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cg) +static inline void put_css_set(struct css_set *cset) { - __put_css_set(cg, 0); + __put_css_set(cset, 0); } -static inline void put_css_set_taskexit(struct css_set *cg) +static inline void put_css_set_taskexit(struct css_set *cset) { - __put_css_set(cg, 1); + __put_css_set(cset, 1); } /* * compare_css_sets - helper function for find_existing_css_set(). - * @cg: candidate css_set being tested - * @old_cg: existing css_set for a task + * @cset: candidate css_set being tested + * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cg" matches "old_cg" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". 
*/ -static bool compare_css_sets(struct css_set *cg, - struct css_set *old_cg, +static bool compare_css_sets(struct css_set *cset, + struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct list_head *l1, *l2; - if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { /* Not all subsystems matched */ return false; } @@ -473,28 +475,28 @@ static bool compare_css_sets(struct css_set *cg, * candidates. */ - l1 = &cg->cg_links; - l2 = &old_cg->cg_links; + l1 = &cset->cg_links; + l2 = &old_cset->cg_links; while (1) { struct cg_cgroup_link *cgl1, *cgl2; - struct cgroup *cg1, *cg2; + struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ - if (l1 == &cg->cg_links) { - BUG_ON(l2 != &old_cg->cg_links); + if (l1 == &cset->cg_links) { + BUG_ON(l2 != &old_cset->cg_links); break; } else { - BUG_ON(l2 == &old_cg->cg_links); + BUG_ON(l2 == &old_cset->cg_links); } /* Locate the cgroups associated with these links. */ cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); - cg1 = cgl1->cgrp; - cg2 = cgl2->cgrp; + cgrp1 = cgl1->cgrp; + cgrp2 = cgl2->cgrp; /* Hierarchies should be linked in the same order. */ - BUG_ON(cg1->root != cg2->root); + BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup @@ -503,11 +505,11 @@ static bool compare_css_sets(struct css_set *cg, * hierarchy, then this css_set should point to the * same cgroup as the old css_set. 
*/ - if (cg1->root == new_cgrp->root) { - if (cg1 != new_cgrp) + if (cgrp1->root == new_cgrp->root) { + if (cgrp1 != new_cgrp) return false; } else { - if (cg1 != cg2) + if (cgrp1 != cgrp2) return false; } } @@ -527,14 +529,13 @@ static bool compare_css_sets(struct css_set *cg, * template: location in which to build the desired set of subsystem * state objects for the new cgroup group */ -static struct css_set *find_existing_css_set( - struct css_set *oldcg, - struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) +static struct css_set *find_existing_css_set(struct css_set *old_cset, + struct cgroup *cgrp, + struct cgroup_subsys_state *template[]) { int i; struct cgroupfs_root *root = cgrp->root; - struct css_set *cg; + struct css_set *cset; unsigned long key; /* @@ -551,17 +552,17 @@ static struct css_set *find_existing_css_set( } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ - template[i] = oldcg->subsys[i]; + template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); - hash_for_each_possible(css_set_table, cg, hlist, key) { - if (!compare_css_sets(cg, oldcg, cgrp, template)) + hash_for_each_possible(css_set_table, cset, hlist, key) { + if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ - return cg; + return cset; } /* No existing cgroup group matched */ @@ -603,18 +604,18 @@ static int allocate_cg_links(int count, struct list_head *tmp) /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() - * @cg: the css_set to be linked + * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_cg_links, - struct css_set *cg, struct cgroup *cgrp) + struct css_set *cset, struct cgroup *cgrp) { struct cg_cgroup_link *link; BUG_ON(list_empty(tmp_cg_links)); link = list_first_entry(tmp_cg_links, struct 
cg_cgroup_link, cgrp_link_list); - link->cg = cg; + link->cg = cset; link->cgrp = cgrp; atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); @@ -622,7 +623,7 @@ static void link_css_set(struct list_head *tmp_cg_links, * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation */ - list_add_tail(&link->cg_link_list, &cg->cg_links); + list_add_tail(&link->cg_link_list, &cset->cg_links); } /* @@ -632,10 +633,10 @@ static void link_css_set(struct list_head *tmp_cg_links, * substituted into the appropriate hierarchy. Must be called with * cgroup_mutex held */ -static struct css_set *find_css_set( - struct css_set *oldcg, struct cgroup *cgrp) +static struct css_set *find_css_set(struct css_set *old_cset, + struct cgroup *cgrp) { - struct css_set *res; + struct css_set *cset; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; struct list_head tmp_cg_links; @@ -646,40 +647,40 @@ static struct css_set *find_css_set( /* First see if we already have a cgroup group that matches * the desired set */ read_lock(&css_set_lock); - res = find_existing_css_set(oldcg, cgrp, template); - if (res) - get_css_set(res); + cset = find_existing_css_set(old_cset, cgrp, template); + if (cset) + get_css_set(cset); read_unlock(&css_set_lock); - if (res) - return res; + if (cset) + return cset; - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) + cset = kmalloc(sizeof(*cset), GFP_KERNEL); + if (!cset) return NULL; /* Allocate all the cg_cgroup_link objects that we'll need */ if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { - kfree(res); + kfree(cset); return NULL; } - atomic_set(&res->refcount, 1); - INIT_LIST_HEAD(&res->cg_links); - INIT_LIST_HEAD(&res->tasks); - INIT_HLIST_NODE(&res->hlist); + atomic_set(&cset->refcount, 1); + INIT_LIST_HEAD(&cset->cg_links); + INIT_LIST_HEAD(&cset->tasks); + INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ - 
memcpy(res->subsys, template, sizeof(res->subsys)); + memcpy(cset->subsys, template, sizeof(cset->subsys)); write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + list_for_each_entry(link, &old_cset->cg_links, cg_link_list) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; - link_css_set(&tmp_cg_links, res, c); + link_css_set(&tmp_cg_links, cset, c); } BUG_ON(!list_empty(&tmp_cg_links)); @@ -687,12 +688,12 @@ static struct css_set *find_css_set( css_set_count++; /* Add this cgroup group to the hash table */ - key = css_set_hash(res->subsys); - hash_add(css_set_table, &res->hlist, key); + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); write_unlock(&css_set_lock); - return res; + return cset; } /* @@ -702,7 +703,7 @@ static struct css_set *find_css_set( static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroupfs_root *root) { - struct css_set *css; + struct css_set *cset; struct cgroup *res = NULL; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -712,12 +713,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * task can't change groups, so the only thing that can happen * is that it exits and its css is set back to init_css_set. 
*/ - css = task->cgroups; - if (css == &init_css_set) { + cset = task->cgroups; + if (cset == &init_css_set) { res = &root->top_cgroup; } else { struct cg_cgroup_link *link; - list_for_each_entry(link, &css->cg_links, cg_link_list) { + list_for_each_entry(link, &cset->cg_links, cg_link_list) { struct cgroup *c = link->cgrp; if (c->root == root) { res = c; @@ -1608,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroupfs_root *existing_root; const struct cred *cred; int i; - struct css_set *cg; + struct css_set *cset; BUG_ON(sb->s_root != NULL); @@ -1666,8 +1667,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cg, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_cg_links, cset, root_cgrp); write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -1944,10 +1945,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *oldcgrp, - struct task_struct *tsk, struct css_set *newcg) +static void cgroup_task_migrate(struct cgroup *old_cgrp, + struct task_struct *tsk, + struct css_set *new_cset) { - struct css_set *oldcg; + struct css_set *old_cset; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1955,25 +1957,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp, * css_set to init_css_set and dropping the old one. 
*/ WARN_ON_ONCE(tsk->flags & PF_EXITING); - oldcg = tsk->cgroups; + old_cset = tsk->cgroups; task_lock(tsk); - rcu_assign_pointer(tsk->cgroups, newcg); + rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); + list_move(&tsk->cg_list, &new_cset->tasks); write_unlock(&css_set_lock); /* - * We just gained a reference on oldcg by taking it from the task. As - * trading it for newcg is protected by cgroup_mutex, we're safe to drop - * it here; it will be freed under RCU. + * We just gained a reference on old_cset by taking it from the + * task. As trading it for new_cset is protected by cgroup_mutex, + * we're safe to drop it here; it will be freed under RCU. */ - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - put_css_set(oldcg); + set_bit(CGRP_RELEASABLE, &old_cgrp->flags); + put_css_set(old_cset); } /** @@ -2925,7 +2927,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp, { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; - struct css_set *cg; + struct css_set *cset; /* Advance to the next non-empty css_set */ do { @@ -2935,10 +2937,10 @@ static void cgroup_advance_iter(struct cgroup *cgrp, return; } link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); - cg = link->cg; - } while (list_empty(&cg->tasks)); + cset = link->cg; + } while (list_empty(&cset->tasks)); it->cg_link = l; - it->task = cg->tasks.next; + it->task = cset->tasks.next; } /* @@ -4516,7 +4518,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) struct cgroup_subsys_state *css; int i, ret; struct hlist_node *tmp; - struct css_set *cg; + struct css_set *cset; unsigned long key; /* check name and function validity */ @@ -4583,17 +4585,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * this is all done under the css_set_lock. 
*/ write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { + hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { /* skip entries that we already rehashed */ - if (cg->subsys[ss->subsys_id]) + if (cset->subsys[ss->subsys_id]) continue; /* remove existing entry */ - hash_del(&cg->hlist); + hash_del(&cset->hlist); /* set new value */ - cg->subsys[ss->subsys_id] = css; + cset->subsys[ss->subsys_id] = css; /* recompute hash and restore entry */ - key = css_set_hash(cg->subsys); - hash_add(css_set_table, &cg->hlist, key); + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); } write_unlock(&css_set_lock); @@ -4653,13 +4655,13 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) */ write_lock(&css_set_lock); list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; + struct css_set *cset = link->cg; unsigned long key; - hash_del(&cg->hlist); - cg->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cg->subsys); - hash_add(css_set_table, &cg->hlist, key); + hash_del(&cset->hlist); + cset->subsys[ss->subsys_id] = NULL; + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); } write_unlock(&css_set_lock); @@ -5006,7 +5008,7 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - struct css_set *cg; + struct css_set *cset; int i; /* @@ -5023,7 +5025,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) /* Reassign the task to the init_css_set. 
*/ task_lock(tsk); - cg = tsk->cgroups; + cset = tsk->cgroups; tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { @@ -5036,7 +5038,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (ss->exit) { struct cgroup *old_cgrp = - rcu_dereference_raw(cg->subsys[i])->cgroup; + rcu_dereference_raw(cset->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); ss->exit(cgrp, old_cgrp, tsk); } @@ -5044,7 +5046,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - put_css_set_taskexit(cg); + put_css_set_taskexit(cset); } static void check_for_release(struct cgroup *cgrp) @@ -5453,12 +5455,12 @@ static int current_css_set_cg_links_read(struct cgroup *cont, struct seq_file *seq) { struct cg_cgroup_link *link; - struct css_set *cg; + struct css_set *cset; read_lock(&css_set_lock); rcu_read_lock(); - cg = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cg->cg_links, cg_link_list) { + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cg_links, cg_link_list) { struct cgroup *c = link->cgrp; const char *name; @@ -5483,11 +5485,11 @@ static int cgroup_css_links_read(struct cgroup *cont, read_lock(&css_set_lock); list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; + struct css_set *cset = link->cg; struct task_struct *task; int count = 0; - seq_printf(seq, "css_set %p\n", cg); - list_for_each_entry(task, &cg->tasks, cg_list) { + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { if (count++ > MAX_TASKS_SHOWN_PER_CSS) { seq_puts(seq, " ...\n"); break; -- cgit v1.2.3 From 69d0206c793a17431eacee2694ee7a4b25df76b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:50 -0700 Subject: cgroup: bring some sanity to naming around cg_cgroup_link cgroups and css_sets are mapped M:N and this M:N mapping is represented by struct cg_cgroup_link which forms linked lists on both 
sides. The naming around this mapping is already confusing and struct cg_cgroup_link exacerbates the situation quite a bit. From cgroup side, it starts off ->css_sets and runs through ->cgrp_link_list. From css_set side, it starts off ->cg_links and runs through ->cg_link_list. This is rather reversed as cgrp_link_list is used to iterate css_sets and cg_link_list cgroups. Also, this is the only place which is still using the confusing "cg" for css_sets. This patch cleans it up a bit. * s/cgroup->css_sets/cgroup->cset_links/ s/css_set->cg_links/css_set->cgrp_links/ s/cgroup_iter->cg_link/cgroup_iter->cset_link/ * s/cg_cgroup_link/cgrp_cset_link/ * s/cgrp_cset_link->cg/cgrp_cset_link->cset/ s/cgrp_cset_link->cgrp_link_list/cgrp_cset_link->cset_link/ s/cgrp_cset_link->cg_link_list/cgrp_cset_link->cgrp_link/ * s/init_css_set_link/init_cgrp_cset_link/ s/free_cg_links/free_cgrp_cset_links/ s/allocate_cg_links/allocate_cgrp_cset_links/ * s/cgl[12]/link[12]/ in compare_css_sets() * s/saved_link/tmp_link/ s/tmp/tmp_links/ and a couple similar adjustments. * Comment and whiteline adjustments. After the changes, we have list_for_each_entry(link, &cont->cset_links, cset_link) { struct css_set *cset = link->cset; instead of list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { struct css_set *cset = link->cg; This patch is purely cosmetic. v2: Fix broken sentences in the patch description.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 226 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 113 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1f5a4e101ed1..ef97bd0cd546 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -315,20 +315,24 @@ static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); -/* Link structure for associating css_set objects with cgroups */ -struct cg_cgroup_link { - /* - * List running through cg_cgroup_links associated with a - * cgroup, anchored on cgroup->css_sets - */ - struct list_head cgrp_link_list; - struct cgroup *cgrp; - /* - * List running through cg_cgroup_links pointing at a - * single css_set object, anchored on css_set->cg_links - */ - struct list_head cg_link_list; - struct css_set *cg; +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. 
+ */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; }; /* The default css_set - used by init and its children prior to any @@ -339,7 +343,7 @@ struct cg_cgroup_link { */ static struct css_set init_css_set; -static struct cg_cgroup_link init_css_set_link; +static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); @@ -378,8 +382,7 @@ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) { - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; + struct cgrp_cset_link *link, *tmp_link; /* * Ensure that the refcount doesn't hit zero while any readers @@ -398,12 +401,11 @@ static void __put_css_set(struct css_set *cset, int taskexit) hash_del(&cset->hlist); css_set_count--; - list_for_each_entry_safe(link, saved_link, &cset->cg_links, - cg_link_list) { + list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { struct cgroup *cgrp = link->cgrp; - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); + list_del(&link->cset_link); + list_del(&link->cgrp_link); /* * We may not be holding cgroup_mutex, and if cgrp->count is @@ -475,26 +477,26 @@ static bool compare_css_sets(struct css_set *cset, * candidates. */ - l1 = &cset->cg_links; - l2 = &old_cset->cg_links; + l1 = &cset->cgrp_links; + l2 = &old_cset->cgrp_links; while (1) { - struct cg_cgroup_link *cgl1, *cgl2; + struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. 
*/ - if (l1 == &cset->cg_links) { - BUG_ON(l2 != &old_cset->cg_links); + if (l1 == &cset->cgrp_links) { + BUG_ON(l2 != &old_cset->cgrp_links); break; } else { - BUG_ON(l2 == &old_cset->cg_links); + BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ - cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); - cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); - cgrp1 = cgl1->cgrp; - cgrp2 = cgl2->cgrp; + link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); + link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); + cgrp1 = link1->cgrp; + cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); @@ -569,61 +571,64 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, return NULL; } -static void free_cg_links(struct list_head *tmp) +static void free_cgrp_cset_links(struct list_head *links_to_free) { - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; + struct cgrp_cset_link *link, *tmp_link; - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); + list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { + list_del(&link->cset_link); kfree(link); } } -/* - * allocate_cg_links() allocates "count" cg_cgroup_link structures - * and chains them on tmp through their cgrp_link_list fields. Returns 0 on - * success or a negative error +/** + * allocate_cgrp_cset_links - allocate cgrp_cset_links + * @count: the number of links to allocate + * @tmp_links: list_head the allocated links are put on + * + * Allocate @count cgrp_cset_link structures and chain them on @tmp_links + * through ->cset_link. Returns 0 on success or -errno. 
*/ -static int allocate_cg_links(int count, struct list_head *tmp) +static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; int i; - INIT_LIST_HEAD(tmp); + + INIT_LIST_HEAD(tmp_links); + for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - free_cg_links(tmp); + free_cgrp_cset_links(tmp_links); return -ENOMEM; } - list_add(&link->cgrp_link_list, tmp); + list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup - * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ -static void link_css_set(struct list_head *tmp_cg_links, - struct css_set *cset, struct cgroup *cgrp) +static void link_css_set(struct list_head *tmp_links, struct css_set *cset, + struct cgroup *cgrp) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; - BUG_ON(list_empty(tmp_cg_links)); - link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, - cgrp_link_list); - link->cg = cset; + BUG_ON(list_empty(tmp_links)); + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); + link->cset = cset; link->cgrp = cgrp; atomic_inc(&cgrp->count); - list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_move(&link->cset_link, &cgrp->cset_links); /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation */ - list_add_tail(&link->cg_link_list, &cset->cg_links); + list_add_tail(&link->cgrp_link, &cset->cgrp_links); } /* @@ -638,10 +643,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, { struct css_set *cset; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - struct list_head tmp_cg_links; - - struct cg_cgroup_link *link; + struct list_head tmp_links; + struct 
cgrp_cset_link *link; unsigned long key; /* First see if we already have a cgroup group that matches @@ -659,14 +662,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, if (!cset) return NULL; - /* Allocate all the cg_cgroup_link objects that we'll need */ - if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { + /* Allocate all the cgrp_cset_link objects that we'll need */ + if (allocate_cgrp_cset_links(root_count, &tmp_links) < 0) { kfree(cset); return NULL; } atomic_set(&cset->refcount, 1); - INIT_LIST_HEAD(&cset->cg_links); + INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_HLIST_NODE(&cset->hlist); @@ -676,14 +679,15 @@ static struct css_set *find_css_set(struct css_set *old_cset, write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - list_for_each_entry(link, &old_cset->cg_links, cg_link_list) { + list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) c = cgrp; - link_css_set(&tmp_cg_links, cset, c); + link_css_set(&tmp_links, cset, c); } - BUG_ON(!list_empty(&tmp_cg_links)); + BUG_ON(!list_empty(&tmp_links)); css_set_count++; @@ -717,9 +721,11 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, if (cset == &init_css_set) { res = &root->top_cgroup; } else { - struct cg_cgroup_link *link; - list_for_each_entry(link, &cset->cg_links, cg_link_list) { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; + if (c->root == root) { res = c; break; @@ -1405,7 +1411,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); - INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1604,7 +1610,7 @@ static struct dentry 
*cgroup_mount(struct file_system_type *fs_type, BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_cg_links; + struct list_head tmp_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; const struct cred *cred; @@ -1636,7 +1642,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * that's us. The worst that can happen is that we * have some link structures left over */ - ret = allocate_cg_links(css_set_count, &tmp_cg_links); + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) goto unlock_drop; @@ -1646,7 +1652,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { - free_cg_links(&tmp_cg_links); + free_cgrp_cset_links(&tmp_links); goto unlock_drop; } /* @@ -1668,10 +1674,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * the css_set objects */ write_lock(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_cg_links, cset, root_cgrp); + link_css_set(&tmp_links, cset, root_cgrp); write_unlock(&css_set_lock); - free_cg_links(&tmp_cg_links); + free_cgrp_cset_links(&tmp_links); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); @@ -1722,9 +1728,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; int ret; - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; BUG_ON(!root); @@ -1740,15 +1745,14 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(ret); /* - * Release all the links from css_sets to this hierarchy's + * Release all the links from cset_links to this hierarchy's * root cgroup */ write_lock(&css_set_lock); - 
list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, - cgrp_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); kfree(link); } write_unlock(&css_set_lock); @@ -2908,12 +2912,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->refcount); - } + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); read_unlock(&css_set_lock); return count; } @@ -2922,24 +2925,23 @@ int cgroup_task_count(const struct cgroup *cgrp) * Advance a list_head iterator. The iterator should be positioned at * the start of a css_set */ -static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) { - struct list_head *l = it->cg_link; - struct cg_cgroup_link *link; + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &cgrp->css_sets) { - it->cg_link = NULL; + if (l == &cgrp->cset_links) { + it->cset_link = NULL; return; } - link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); - cset = link->cg; + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; } while (list_empty(&cset->tasks)); - it->cg_link = l; + it->cset_link = l; it->task = cset->tasks.next; } @@ -3160,7 +3162,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->cg_link = &cgrp->css_sets; + it->cset_link = &cgrp->cset_links; 
cgroup_advance_iter(cgrp, it); } @@ -3169,16 +3171,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cg_link) + if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); - if (l == &link->cg->tasks) { + link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); + if (l == &link->cset->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); @@ -4625,7 +4627,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); */ void cgroup_unload_subsys(struct cgroup_subsys *ss) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; BUG_ON(ss->module == NULL); @@ -4654,8 +4656,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * in loading, we need to pay our respects to the hashtable gods. 
*/ write_lock(&css_set_lock); - list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { - struct css_set *cset = link->cg; + list_for_each_entry(link, &dummytop->cset_links, cset_link) { + struct css_set *cset = link->cset; unsigned long key; hash_del(&cset->hlist); @@ -4688,7 +4690,7 @@ int __init cgroup_init_early(void) { int i; atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cg_links); + INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; @@ -4696,12 +4698,10 @@ int __init cgroup_init_early(void) root_count = 1; init_task.cgroups = &init_css_set; - init_css_set_link.cg = &init_css_set; - init_css_set_link.cgrp = dummytop; - list_add(&init_css_set_link.cgrp_link_list, - &rootnode.top_cgroup.css_sets); - list_add(&init_css_set_link.cg_link_list, - &init_css_set.cg_links); + init_cgrp_cset_link.cset = &init_css_set; + init_cgrp_cset_link.cgrp = dummytop; + list_add(&init_cgrp_cset_link.cset_link, &rootnode.top_cgroup.cset_links); + list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -5454,13 +5454,13 @@ static int current_css_set_cg_links_read(struct cgroup *cont, struct cftype *cft, struct seq_file *seq) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; struct css_set *cset; read_lock(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cg_links, cg_link_list) { + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; const char *name; @@ -5481,11 +5481,11 @@ static int cgroup_css_links_read(struct cgroup *cont, struct cftype *cft, struct seq_file *seq) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { - struct css_set *cset = link->cg; + 
list_for_each_entry(link, &cont->cset_links, cset_link) { + struct css_set *cset = link->cset; struct task_struct *task; int count = 0; seq_printf(seq, "css_set %p\n", cset); -- cgit v1.2.3 From f4f4be2bd2889c69a8698edef8dbfd4f6759aa87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:51 -0700 Subject: cgroup: use kzalloc() instead of kmalloc() There's no point in using kmalloc() instead of the clearing variant for trivial stuff. We can live dangerously elsewhere. Use kzalloc() instead and drop 0 inits. While at it, do trivial code reorganization in cgroup_file_open(). This patch doesn't introduce any functional changes. v2: I was caught in the very distant past where list_del() didn't poison and the initial version converted list_del()s to list_del_init()s too. Li and Kent took me out of the stasis chamber. Signed-off-by: Tejun Heo Cc: Kent Overstreet Acked-by: Li Zefan --- kernel/cgroup.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef97bd0cd546..d86a8477d56a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -597,7 +597,7 @@ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { - link = kmalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; @@ -658,7 +658,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, if (cset) return cset; - cset = kmalloc(sizeof(*cset), GFP_KERNEL); + cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; @@ -2475,10 +2475,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file) cft = __d_cft(file->f_dentry); if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state = - kzalloc(sizeof(*state), GFP_USER); + struct cgroup_seqfile_state *state; + + state = kzalloc(sizeof(*state), GFP_USER); 
if (!state) return -ENOMEM; + state->cft = cft; state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; @@ -3511,7 +3513,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, } } /* entry not found; create a new one */ - l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); return l; @@ -3520,8 +3522,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, down_write(&l->mutex); l->key.type = type; l->key.ns = get_pid_ns(ns); - l->use_count = 0; /* don't increment here */ - l->list = NULL; l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); mutex_unlock(&cgrp->pidlist_mutex); -- cgit v1.2.3 From 54766d4a1d3d6f84ff8fa475cd8f165c0a0000eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:53 -0700 Subject: cgroup: rename CGRP_REMOVED to CGRP_DEAD We will add another flag indicating that the cgroup is in the process of being killed. REMOVING / REMOVED is more difficult to distinguish and cgroup_is_removing()/cgroup_is_removed() are a bit awkward. Also, later percpu_ref usage will involve "kill"ing the refcnt. s/CGRP_REMOVED/CGRP_DEAD/ s/cgroup_is_removed()/cgroup_is_dead() This patch is purely cosmetic. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d86a8477d56a..84efb344fdf6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,9 +226,9 @@ static int css_refcnt(struct cgroup_subsys_state *css) } /* convenient tests for these bits */ -static inline bool cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_REMOVED, &cgrp->flags); + return test_bit(CGRP_DEAD, &cgrp->flags); } /** @@ -300,7 +300,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) static bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { + if (cgroup_is_dead(cgrp)) { mutex_unlock(&cgroup_mutex); return false; } @@ -892,7 +892,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) if (S_ISDIR(inode->i_mode)) { struct cgroup *cgrp = dentry->d_fsdata; - BUG_ON(!(cgroup_is_removed(cgrp))); + BUG_ON(!(cgroup_is_dead(cgrp))); call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); @@ -2363,7 +2363,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_removed(cgrp)) + if (cgroup_is_dead(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -2408,7 +2408,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_removed(cgrp)) + if (cgroup_is_dead(cgrp)) return -ENODEV; if (cft->read) @@ -2831,7 +2831,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); 
mutex_lock(&cgroup_mutex); - if (!cgroup_is_removed(cgrp)) + if (!cgroup_is_dead(cgrp)) cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2999,14 +2999,14 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) /* * @pos could already have been removed. Once a cgroup is removed, * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_REMOVED is set on removal which is fully + * changes. As CGRP_DEAD is set on removal which is fully * serialized, if we see it unasserted, it's guaranteed that the * next sibling hasn't finished its grace period even if it's * already removed, and thus safe to dereference from this RCU * critical section. If ->sibling.next is inaccessible, - * cgroup_is_removed() is guaranteed to be visible as %true here. + * cgroup_is_dead() is guaranteed to be visible as %true here. */ - if (likely(!cgroup_is_removed(pos))) { + if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; @@ -4383,7 +4383,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * attempts fail thus maintaining the removal conditions verified * above. * - * Note that CGRP_REMVOED clearing is depended upon by + * Note that CGRP_DEAD assertion is depended upon by * cgroup_next_sibling() to resume iteration after dropping RCU * read lock. See cgroup_next_sibling() for details. 
*/ @@ -4393,7 +4393,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); } - set_bit(CGRP_REMOVED, &cgrp->flags); + set_bit(CGRP_DEAD, &cgrp->flags); /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) @@ -5063,7 +5063,7 @@ static void check_for_release(struct cgroup *cgrp) int need_schedule_work = 0; raw_spin_lock(&release_list_lock); - if (!cgroup_is_removed(cgrp) && + if (!cgroup_is_dead(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; @@ -5209,9 +5209,7 @@ __setup("cgroup_disable=", cgroup_disable); * Functons for CSS ID. */ -/* - *To get ID other than 0, this should be called when !cgroup_is_removed(). - */ +/* to get ID other than 0, this should be called when !cgroup_is_dead() */ unsigned short css_id(struct cgroup_subsys_state *css) { struct css_id *cssid; -- cgit v1.2.3 From ddd69148bdc45e5e3e55bfde3571daecd5a96d75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:54 -0700 Subject: cgroup: drop unnecessary RCU dancing from __put_css_set() __put_css_set() does RCU read access on @cgrp across dropping @cgrp->count so that it can continue accessing @cgrp even if the count reached zero and destruction of the cgroup commenced. Given that both sides - __css_put() and cgroup_destroy_locked() - are cold paths, this is unnecessary. Just making cgroup_destroy_locked() grab css_set_lock while checking @cgrp->count is enough. Remove the RCU read locking from __put_css_set() and make cgroup_destroy_locked() read-lock css_set_lock when checking @cgrp->count. This will also allow removing @cgrp->count. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 84efb344fdf6..1a68241ca835 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -407,19 +407,13 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* - * We may not be holding cgroup_mutex, and if cgrp->count is - * dropped to 0 the cgroup can be destroyed at any time, hence - * rcu_read_lock is used to keep it alive. - */ - rcu_read_lock(); + /* @cgrp can't go away while we're holding css_set_lock */ if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - rcu_read_unlock(); kfree(link); } @@ -4370,11 +4364,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) struct cgroup *parent = cgrp->parent; struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; + bool empty; lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) + /* + * css_set_lock prevents @cgrp from being removed while + * __put_css_set() is in progress. 
+ */ + read_lock(&css_set_lock); + empty = !atomic_read(&cgrp->count) && list_empty(&cgrp->children); + read_unlock(&css_set_lock); + if (!empty) return -EBUSY; /* @@ -5051,8 +5053,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) static void check_for_release(struct cgroup *cgrp) { - /* All of these checks rely on RCU to keep the cgroup - * structure alive */ if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { /* -- cgit v1.2.3 From 6f3d828f0fb7fdaffc6f32cb8a1cb7fcf8824598 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:55 -0700 Subject: cgroup: remove cgroup->count and use cgroup->count tracks the number of css_sets associated with the cgroup and is used only to verify that no css_set is associated when the cgroup is being destroyed. It's superfluous as the destruction path can simply check whether cgroup->cset_links is empty instead. Drop cgroup->count and check ->cset_links directly from cgroup_destroy_locked().
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a68241ca835..49bfd7b0bbda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -408,8 +408,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_lock */ - if (atomic_dec_and_test(&cgrp->count) && - notify_on_release(cgrp)) { + if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); @@ -616,7 +615,6 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; - atomic_inc(&cgrp->count); list_move(&link->cset_link, &cgrp->cset_links); /* * Always add links to the tail of the list so that the list @@ -4370,11 +4368,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock prevents @cgrp from being removed while - * __put_css_set() is in progress. + * css_set_lock synchronizes access to ->cset_links and prevents + * @cgrp from being removed while __put_css_set() is in progress. */ read_lock(&css_set_lock); - empty = !atomic_read(&cgrp->count) && list_empty(&cgrp->children); + empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); read_unlock(&css_set_lock); if (!empty) return -EBUSY; @@ -5054,7 +5052,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) static void check_for_release(struct cgroup *cgrp) { if (cgroup_is_releasable(cgrp) && - !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { /* * Control Group is currently removeable. 
If it's not * already queued for a userspace notification, queue @@ -5422,11 +5420,6 @@ static void debug_css_free(struct cgroup *cont) kfree(cont->subsys[debug_subsys_id]); } -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) { return cgroup_task_count(cont); @@ -5507,10 +5500,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) } static struct cftype debug_files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, { .name = "taskcount", .read_u64 = debug_taskcount_read, -- cgit v1.2.3 From 455050d23e1bfc47ca98e943ad5b2f3a9bbe45fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2013 19:27:41 -0700 Subject: cgroup: reorder the operations in cgroup_destroy_locked() This patch reorders the operations in cgroup_destroy_locked() such that the userland visible parts happen before css offlining and removal from the ->sibling list. This will be used to make css use percpu refcnt. While at it, split out CGRP_DEAD related comment from the refcnt deactivation one and correct / clarify how different guarantees are met. While this patch changes the specific order of operations, it shouldn't cause any noticeable behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 61 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49bfd7b0bbda..5a1ddecc3cfa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4379,13 +4379,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Block new css_tryget() by deactivating refcnt and mark @cgrp - * removed. This makes future css_tryget() and child creation - * attempts fail thus maintaining the removal conditions verified - * above. 
- * - * Note that CGRP_DEAD assertion is depended upon by - * cgroup_next_sibling() to resume iteration after dropping RCU - * read lock. See cgroup_next_sibling() for details. + * removed. This makes future css_tryget() attempts fail which we + * guarantee to ->css_offline() callbacks. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -4393,8 +4388,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); } + + /* + * Mark @cgrp dead. This prevents further task migration and child + * creation by disabling cgroup_lock_live_group(). Note that + * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to + * resume iteration after dropping RCU read lock. See + * cgroup_next_sibling() for details. + */ set_bit(CGRP_DEAD, &cgrp->flags); + /* CGRP_DEAD is set, remove from ->release_list for the last time */ + raw_spin_lock(&release_list_lock); + if (!list_empty(&cgrp->release_list)) + list_del_init(&cgrp->release_list); + raw_spin_unlock(&release_list_lock); + + /* + * Remove @cgrp directory. The removal puts the base ref but we + * aren't quite done with @cgrp yet, so hold onto it. + */ + dget(d); + cgroup_d_remove_dir(d); + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. 
+ */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) offline_css(ss, cgrp); @@ -4409,34 +4437,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); - raw_spin_lock(&release_list_lock); - if (!list_empty(&cgrp->release_list)) - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); list_del_init(&cgrp->allcg_node); - dget(d); - cgroup_d_remove_dir(d); dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; } -- cgit v1.2.3 From ea15f8ccdb430af1e8bc9b4e19a230eb4c356777 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2013 19:27:42 -0700 Subject: cgroup: split cgroup destruction into two steps Split cgroup_destroy_locked() into two steps and put the latter half into cgroup_offline_fn() which is executed from a work item. The latter half is responsible for offlining the css's, removing the cgroup from internal lists, and propagating release notification to the parent. The separation is to allow using percpu refcnt for css. Note that this allows for other cgroup operations to happen between the first and second halves of destruction, including creating a new cgroup with the same name. 
As the target cgroup is marked DEAD in the first half and cgroup internals don't care about the names of cgroups, this should be fine. A comment explaining this will be added by the next patch which implements the actual percpu refcnting. As RCU freeing is guaranteed to happen after the second step of destruction, we can use the same work item for both. This patch renames cgroup->free_work to ->destroy_work and uses it for both purposes. INIT_WORK() is now performed right before queueing the work item. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a1ddecc3cfa..df6814706cca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -208,6 +208,7 @@ static struct cgroup_name root_cgroup_name = { .name = "/" }; */ static int need_forkexit_callback __read_mostly; +static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); @@ -830,7 +831,7 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, free_work); + struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); @@ -875,7 +876,8 @@ static void cgroup_free_rcu(struct rcu_head *head) { struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - schedule_work(&cgrp->free_work); + INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); + schedule_work(&cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -1407,7 +1409,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->allcg_node); 
INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); - INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); @@ -2991,12 +2992,13 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) /* * @pos could already have been removed. Once a cgroup is removed, * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD is set on removal which is fully - * serialized, if we see it unasserted, it's guaranteed that the - * next sibling hasn't finished its grace period even if it's - * already removed, and thus safe to dereference from this RCU - * critical section. If ->sibling.next is inaccessible, - * cgroup_is_dead() is guaranteed to be visible as %true here. + * changes. As CGRP_DEAD assertion is serialized and happens + * before the cgroup is taken off the ->sibling list, if we see it + * unasserted, it's guaranteed that the next sibling hasn't + * finished its grace period even if it's already removed, and thus + * safe to dereference from this RCU critical section. If + * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed + * to be visible as %true here. 
*/ if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); @@ -4359,7 +4361,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup *parent = cgrp->parent; struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; bool empty; @@ -4423,6 +4424,21 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } spin_unlock(&cgrp->event_list_lock); + INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); + schedule_work(&cgrp->destroy_work); + + return 0; +}; + +static void cgroup_offline_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); + struct cgroup *parent = cgrp->parent; + struct dentry *d = cgrp->dentry; + struct cgroup_subsys *ss; + + mutex_lock(&cgroup_mutex); + /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) offline_css(ss, cgrp); @@ -4446,7 +4462,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - return 0; + mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -- cgit v1.2.3 From d3daf28da16a30af95bfb303189a634a87606725 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2013 19:39:16 -0700 Subject: cgroup: use percpu refcnt for cgroup_subsys_states A css (cgroup_subsys_state) is how each cgroup is represented to a controller. As such, it can be used in hot paths across the various subsystems different controllers are associated with. One of the common operations is reference counting, which up until now has been implemented using a global atomic counter and can have significant adverse impact on scalability. For example, css refcnt can be gotten and put multiple times by blkcg for each IO request. 
For high-ops configurations which try to do as much per-cpu as possible, the global frequent refcnting can be very expensive. In general, given the various and hugely diverse paths css's end up being used from, we need to make it cheap and highly scalable. In its usage, css refcnting isn't very different from module refcnting. This patch converts css refcnting to use the recently added percpu_ref. css_get/tryget/put() map directly to the matching percpu_ref operations and the deactivation logic is no longer necessary as percpu_ref already has refcnt killing. The only complication is that as the refcnt is per-cpu, percpu_ref_kill() in itself doesn't ensure that further tryget operations will fail, which we need to guarantee before invoking ->css_offline()'s. This is resolved by collecting kill confirmation using percpu_ref_kill_and_confirm() and initiating the offline phase of destruction after all css refcnt's are confirmed to be seen as killed on all CPUs. The previous patches already split destruction into two phases, so percpu_ref_kill_and_confirm() can be hooked up easily. This patch removes css_refcnt() which is used for rcu dereference sanity check in css_id(). While we can add a percpu refcnt API to ask the same question, css_id() itself is scheduled to be removed fairly soon, so let's not bother with it. Just drop the sanity check and use rcu_dereference_raw() instead. v2: - init_cgroup_css() was calling percpu_ref_init() without checking the return value. This causes two problems - the obvious lack of error handling and percpu_ref_init() being called from cgroup_init_subsys() before the allocators are up, which triggers warnings but doesn't cause actual problems as the refcnt isn't used for roots anyway. Fix both by moving percpu_ref_init() to cgroup_create(). - The base references were put too early by percpu_ref_kill_and_confirm() and cgroup_offline_fn() put the refs one extra time.
This wasn't noticeable because css's go through another RCU grace period before being freed. Update cgroup_destroy_locked() to grab an extra reference before killing the refcnts. This problem was noticed by Kent. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Acked-by: Li Zefan Cc: Michal Hocko Cc: Mike Snitzer Cc: Vivek Goyal Cc: "Alasdair G. Kergon" Cc: Jens Axboe Cc: Mikulas Patocka Cc: Glauber Costa --- kernel/cgroup.c | 165 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 104 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ebbfc043153f..2e9da7bf25cb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,9 +63,6 @@ #include -/* css deactivation bias, makes css->refcnt negative to deny new trygets */ -#define CSS_DEACT_BIAS INT_MIN - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -static int css_unbias_refcnt(int refcnt) -{ - return refcnt >= 0 ? 
refcnt : refcnt - CSS_DEACT_BIAS; -} - -/* the current nr of refs, always >= 0 whether @css is deactivated or not */ -static int css_refcnt(struct cgroup_subsys_state *css) -{ - int v = atomic_read(&css->refcnt); - - return css_unbias_refcnt(v); -} - /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work) deactivate_super(sb); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + schedule_work(&css->dput_work); +} + static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 1); css->flags = 0; css->id = NULL; if (cgrp == dummytop) @@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free_all; + init_cgroup_css(css, ss, cgrp); + if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) @@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + if (css) { + percpu_ref_cancel_init(&css->refcnt); ss->css_free(cgrp); + } } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ @@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } +static void cgroup_css_killed(struct cgroup *cgrp) +{ + if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) + return; + + /* percpu ref's of all css's are killed, kick off the next step */ + INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); + 
schedule_work(&cgrp->destroy_work); +} + +static void css_ref_killed_fn(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + cgroup_css_killed(css->cgroup); +} + +/** + * cgroup_destroy_locked - the first stage of cgroup destruction + * @cgrp: cgroup to be destroyed + * + * css's make use of percpu refcnts whose killing latency shouldn't be + * exposed to userland and are RCU protected. Also, cgroup core needs to + * guarantee that css_tryget() won't succeed by the time ->css_offline() is + * invoked. To satisfy all the requirements, destruction is implemented in + * the following two steps. + * + * s1. Verify @cgrp can be destroyed and mark it dying. Remove all + * userland visible parts and start killing the percpu refcnts of + * css's. Set up so that the next stage will be kicked off once all + * the percpu refcnts are confirmed to be killed. + * + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the + * rest of destruction. Once all cgroup references are gone, the + * cgroup is RCU-freed. + * + * This function implements s1. After this step, @cgrp is gone as far as + * the userland is concerned and a new cgroup with the same name may be + * created. As cgroup doesn't care about the names internally, this + * doesn't cause any problem. + */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { @@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by deactivating refcnt and mark @cgrp - * removed. This makes future css_tryget() attempts fail which we - * guarantee to ->css_offline() callbacks. + * Block new css_tryget() by killing css refcnts. cgroup core + * guarantees that, by the time ->css_offline() is invoked, no new + * css reference will be given out via css_tryget(). 
We can't + * simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen + * as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. The + * notification callback keeps track of the number of css's to be + * killed and schedules cgroup_offline_fn() to perform the rest of + * destruction once the percpu refs of all css's are confirmed to + * be killed. */ + atomic_set(&cgrp->css_kill_cnt, 1); for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); + /* + * Killing would put the base ref, but we need to keep it + * alive until after ->css_offline. + */ + percpu_ref_get(&css->refcnt); + + atomic_inc(&cgrp->css_kill_cnt); + percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); } + cgroup_css_killed(cgrp); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } spin_unlock(&cgrp->event_list_lock); - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); - return 0; }; +/** + * cgroup_offline_fn - the second step of cgroup destruction + * @work: cgroup->destroy_free_work + * + * This function is invoked from a work item for a cgroup which is being + * destroyed after the percpu refcnts of all css's are guaranteed to be + * seen as killed on all CPUs, and performs the rest of destruction. This + * is the second step of destruction described in the comment above + * cgroup_destroy_locked(). 
+ */ static void cgroup_offline_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); - /* tell subsystems to initate destruction */ + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ for_each_subsys(cgrp->root, ss) offline_css(ss, cgrp); /* - * Put all the base refs. Each css holds an extra reference to the - * cgroup's dentry and cgroup removal proceeds regardless of css - * refs. On the last put of each css, whenever that may be, the - * extra dentry ref is put so that dentry destruction happens only - * after all css's are released. + * Put the css refs from cgroup_destroy_locked(). Each css holds + * an extra reference to the cgroup's dentry and cgroup removal + * proceeds regardless of css refs. On the last put of each css, + * whenever that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. 
*/ for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); @@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp) } } -/* Caller must verify that the css is not for root cgroup */ -bool __css_tryget(struct cgroup_subsys_state *css) -{ - while (true) { - int t, v; - - v = css_refcnt(css); - t = atomic_cmpxchg(&css->refcnt, v, v + 1); - if (likely(t == v)) - return true; - else if (t < 0) - return false; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(__css_tryget); - -/* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css) -{ - int v; - - v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - if (v == 0) - schedule_work(&css->dput_work); -} -EXPORT_SYMBOL_GPL(__css_put); - /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path @@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, css_refcnt(css)); + cssid = rcu_dereference_raw(css->id); if (cssid) return cssid->id; -- cgit v1.2.3 From c9e5fe66f5947c9e56dfc7655e5b4b127ca2120f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Jun 2013 11:18:27 +0800 Subject: cpuset: rename @cont to @cgrp Cont is short for container. control group was named process container at first, but then people found container already has a meaning in linux kernel. Clean up the leftover variable name @cont. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4c17d96bd3a5..654c95979028 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -116,9 +116,9 @@ struct cpuset { }; /* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cont) +static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), + return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), struct cpuset, css); } @@ -433,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial) static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { - struct cgroup *cont; + struct cgroup *cgrp; struct cpuset *c, *par; int ret; @@ -441,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; - cpuset_for_each_child(c, cont, cur) + cpuset_for_each_child(c, cgrp, cur) if (!is_cpuset_subset(c, trial)) goto out; @@ -462,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * overlap */ ret = -EINVAL; - cpuset_for_each_child(c, cont, par) { + cpuset_for_each_child(c, cgrp, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -1759,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) return count; } -static ssize_t cpuset_common_file_read(struct cgroup *cont, +static ssize_t cpuset_common_file_read(struct cgroup *cgrp, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ 
-1795,9 +1795,9 @@ out: return retval; } -static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1826,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1940,14 +1940,14 @@ static struct cftype files[] = { /* * cpuset_css_alloc - allocate a cpuset css - * cont: control group that the new cpuset will be part of + * cgrp: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) { struct cpuset *cs; - if (!cont->parent) + if (!cgrp->parent) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -2042,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) * will call rebuild_sched_domains_locked(). */ -static void cpuset_css_free(struct cgroup *cont) +static void cpuset_css_free(struct cgroup *cgrp) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = cgroup_cs(cgrp); free_cpumask_var(cs->cpus_allowed); kfree(cs); -- cgit v1.2.3 From c5cdc67a58a22c49f558b450c6f748251ceb2e7b Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 13 Jun 2013 22:19:43 +0000 Subject: irqdomain: Remove temporary MIPS workaround code The MIPS interrupt controllers are all registering their own irq_domains now. Drop the MIPS specific code because it is no longer needed. 
Signed-off-by: Grant Likely Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/5458/ Signed-off-by: Ralf Baechle --- kernel/irq/irqdomain.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 54a4d5223238..a341b3d433ad 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -665,18 +665,6 @@ unsigned int irq_create_of_mapping(struct device_node *controller, domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { -#ifdef CONFIG_MIPS - /* - * Workaround to avoid breaking interrupt controller drivers - * that don't yet register an irq_domain. This is temporary - * code. ~~~gcl, Feb 24, 2012 - * - * Scheduled for removal in Linux v3.6. That should be enough - * time. - */ - if (intsize > 0) - return intspec[0]; -#endif pr_warning("no irq domain found for %s !\n", of_node_full_name(controller)); return 0; -- cgit v1.2.3 From 336ae1180df5f69b9e0fb6561bec01c5f64361cf Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Jun 2013 15:40:58 -0700 Subject: ARM: sched_clock: Load cycle count after epoch stabilizes There is a small race between when the cycle count is read from the hardware and when the epoch stabilizes. Consider this scenario: CPU0 CPU1 ---- ---- cyc = read_sched_clock() cyc_to_sched_clock() update_sched_clock() ... cd.epoch_cyc = cyc; epoch_cyc = cd.epoch_cyc; ... epoch_ns + cyc_to_ns((cyc - epoch_cyc) The cyc on cpu0 was read before the epoch changed. But we calculate the nanoseconds based on the new epoch by subtracting the new epoch from the old cycle count. Since epoch is most likely larger than the old cycle count we calculate a large number that will be converted to nanoseconds and added to epoch_ns, causing time to jump forward too much. Fix this problem by reading the hardware after the epoch has stabilized. 
Cc: Russell King Signed-off-by: Stephen Boyd Signed-off-by: John Stultz --- kernel/time/sched_clock.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index aad1ae6077ef..a326f27d7f09 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,10 +49,14 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) +static unsigned long long notrace sched_clock_32(void) { u64 epoch_ns; u32 epoch_cyc; + u32 cyc; + + if (cd.suspended) + return cd.epoch_ns; /* * Load the epoch_cyc and epoch_ns atomically. We do this by @@ -68,7 +72,9 @@ static unsigned long long notrace cyc_to_sched_clock(u32 cyc, u32 mask) smp_rmb(); } while (epoch_cyc != cd.epoch_cyc_copy); - return epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask, cd.mult, cd.shift); + cyc = read_sched_clock(); + cyc = (cyc - epoch_cyc) & sched_clock_mask; + return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); } /* @@ -160,19 +166,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -static unsigned long long notrace sched_clock_32(void) -{ - u32 cyc = read_sched_clock(); - return cyc_to_sched_clock(cyc, sched_clock_mask); -} - unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; unsigned long long notrace sched_clock(void) { - if (cd.suspended) - return cd.epoch_ns; - return sched_clock_func(); } -- cgit v1.2.3 From 37074c5a1b9979d05b9effc7634385fc0fa7ccc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 12 Jun 2013 14:24:12 +0200 Subject: irq/generic-chip: fix a few kernel-doc entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- 
kernel/irq/generic-chip.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f668..b34e7267b817 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -48,7 +48,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) } /** - * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register + * irq_gc_mask_set_bit - Mask irq via setting bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -66,7 +66,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) } /** - * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) } /** - * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt + * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt * @d: irq_data */ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) -- cgit v1.2.3 From 6db8e85c5c1f89cd0183b76dab027c81009f129f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Jun 2013 11:18:22 -0700 Subject: cgroup: disallow rename(2) if sane_behavior cgroup's rename(2) isn't a proper migration implementation - it can't move the cgroup to a different parent in the hierarchy. All it can do is swapping the name string for that cgroup. This isn't useful and can mislead users to think that cgroup supports proper cgroup-level migration. Disallow rename(2) if sane_behavior. v2: Fail with -EPERM instead of -EINVAL so that it matches the vfs return value when ->rename is not implemented as suggested by Li. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2e9da7bf25cb..c2c64005bbc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2508,6 +2508,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, cgrp = __d_cgrp(old_dentry); + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow if sane_behavior. + */ + if (cgroup_sane_behavior(cgrp)) + return -EPERM; + name = cgroup_alloc_name(new_dentry); if (!name) return -ENOMEM; -- cgit v1.2.3 From 084457f284abf6789d90509ee11dae383842b23b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Jun 2013 18:40:19 +0800 Subject: cgroup: fix umount vs cgroup_cfts_commit() race cgroup_cfts_commit() uses dget() to keep cgroup alive after cgroup_mutex is dropped, but dget() won't prevent cgroupfs from being umounted. When the race happens, vfs will see some dentries with non-zero refcnt while umount is in progress.
Keep running this: mount -t cgroup -o blkio xxx /cgroup umount /cgroup And this: modprobe cfq-iosched rmmod cfq-iosched After a while, the BUG() in shrink_dcache_for_umount_subtree() may be triggered: BUG: Dentry xxx{i=0,n=blkio.yyy} still in use (1) [umount of cgroup cgroup] Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c2c64005bbc2..0224f6b3103e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2798,13 +2798,17 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, { LIST_HEAD(pending); struct cgroup *cgrp, *n; + struct super_block *sb = ss->root->sb; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (cfts && ss->root != &rootnode) { + if (cfts && ss->root != &rootnode && + atomic_inc_not_zero(sb->s_active)) { list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { dget(cgrp->dentry); list_add_tail(&cgrp->cft_q_node, &pending); } + } else { + sb = NULL; } mutex_unlock(&cgroup_mutex); @@ -2827,6 +2831,9 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, dput(cgrp->dentry); } + if (sb) + deactivate_super(sb); + mutex_unlock(&cgroup_cft_mutex); } -- cgit v1.2.3 From 1c8158eeae0f37d0eee9f1fbe68080df6a408df2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Jun 2013 18:41:10 +0800 Subject: cgroup: fix umount vs cgroup_event_remove() race commit 5db9a4d99b0157a513944e9a44d29c9cec2e91dc Author: Tejun Heo Date: Sat Jul 7 16:08:18 2012 -0700 cgroup: fix cgroup hierarchy umount race This commit fixed a race caused by the dput() in css_dput_fn(), but the dput() in cgroup_event_remove() can also lead to the same BUG().
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0224f6b3103e..7db2940bfc77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3821,6 +3821,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, return 0; } +/* + * When dput() is called asynchronously, if umount has been done and + * then deactivate_super() in cgroup_free_fn() kills the superblock, + * there's a small window that vfs will see the root dentry with non-zero + * refcnt and trigger BUG(). + * + * That's why we hold a reference before dput() and drop it right after. + */ +static void cgroup_dput(struct cgroup *cgrp) +{ + struct super_block *sb = cgrp->root->sb; + + atomic_inc(&sb->s_active); + dput(cgrp->dentry); + deactivate_super(sb); +} + /* * Unregister event and free resources. * @@ -3841,7 +3858,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); - dput(cgrp->dentry); + cgroup_dput(cgrp); } /* @@ -4129,12 +4146,8 @@ static void css_dput_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, dput_work); - struct dentry *dentry = css->cgroup->dentry; - struct super_block *sb = dentry->d_sb; - atomic_inc(&sb->s_active); - dput(dentry); - deactivate_super(sb); + cgroup_dput(css->cgroup); } static void css_release(struct percpu_ref *ref) -- cgit v1.2.3 From f57947d27711451a7739a25bba6cddc8a385e438 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Jun 2013 18:41:53 +0800 Subject: cgroup: fix memory leak in cgroup_rm_cftypes() The memory allocated in cgroup_add_cftypes() should be freed. The effect of this bug is we leak a bit memory everytime we unload cfq-iosched module if blkio cgroup is enabled. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7db2940bfc77..1d4f471de8d5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2889,7 +2889,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) list_for_each_entry(set, &ss->cftsets, node) { if (set->cfts == cfts) { - list_del_init(&set->node); + list_del(&set->node); + kfree(set); cgroup_cfts_commit(ss, cfts, false); return 0; } -- cgit v1.2.3 From 794611a1dfcb055d7d41ce133378dd8197d73e38 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Jun 2013 18:53:53 +0800 Subject: cgroup: make serial_nr_cursor available throughout cgroup.c The next patch will use it to determine if a cgroup is newly created while we're iterating the cgroup hierarchy. tj: Rephrased the comment on top of cgroup_serial_nr_cursor. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d4f471de8d5..e6571ca822a0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -198,6 +198,15 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static struct cgroup_name root_cgroup_name = { .name = "/" }; +/* + * Assign a monotonically increasing serial number to cgroups. It + * guarantees cgroups with bigger numbers are newer than those with smaller + * numbers. Also, as cgroups are always appended to the parent's + * ->children list, it guarantees that sibling cgroups are always sorted in + * the ascending serial number order on the list. + */ +static atomic64_t cgroup_serial_nr_cursor = ATOMIC64_INIT(0); + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. 
This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -4222,7 +4231,6 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { - static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0); struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4309,13 +4317,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); - /* - * Assign a monotonically increasing serial number. With the list - * appending below, it guarantees that sibling cgroups are always - * sorted in the ascending serial number order on the parent's - * ->children. - */ - cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor); + cgrp->serial_nr = atomic64_inc_return(&cgroup_serial_nr_cursor); /* allocation complete, commit to creation */ list_add_tail(&cgrp->allcg_node, &root->allcg_list); -- cgit v1.2.3 From e8c82d20a9f729cf4b9f73043f7fd4e0872bebfd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Jun 2013 18:48:37 +0800 Subject: cgroup: convert cgroup_cft_commit() to use cgroup_for_each_descendant_pre() We used root->allcg_list to iterate cgroup hierarchy because at that time cgroup_for_each_descendant_pre() hasn't been invented. tj: In cgroup_cfts_commit(), s/@serial_nr/@update_upto/, move the assignment right above releasing cgroup_mutex and explain what's going on there. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 80 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e6571ca822a0..0ed7d8db6508 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1399,7 +1399,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -1414,12 +1413,10 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); - INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; cgrp->name = &root_cgroup_name; init_cgroup_housekeeping(cgrp); - list_add_tail(&cgrp->allcg_node, &root->allcg_list); } static int cgroup_init_root_id(struct cgroupfs_root *root) @@ -2785,65 +2782,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, return ret; } -static DEFINE_MUTEX(cgroup_cft_mutex); - static void cgroup_cfts_prepare(void) - __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) + __acquires(&cgroup_mutex) { /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we increment reference on all cgroups and build list of - * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure - * exclusive access to the field. + * Instead, we use cgroup_for_each_descendant_pre() and drop RCU + * read lock before calling cgroup_addrm_files(). 
*/ - mutex_lock(&cgroup_cft_mutex); mutex_lock(&cgroup_mutex); } static void cgroup_cfts_commit(struct cgroup_subsys *ss, struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) + __releases(&cgroup_mutex) { LIST_HEAD(pending); - struct cgroup *cgrp, *n; + struct cgroup *cgrp, *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; + struct dentry *prev = NULL; + struct inode *inode; + u64 update_upto; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (cfts && ss->root != &rootnode && - atomic_inc_not_zero(sb->s_active)) { - list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { - dget(cgrp->dentry); - list_add_tail(&cgrp->cft_q_node, &pending); - } - } else { - sb = NULL; + if (!cfts || ss->root == &rootnode || + !atomic_inc_not_zero(&sb->s_active)) { + mutex_unlock(&cgroup_mutex); + return; } - mutex_unlock(&cgroup_mutex); - /* - * All new cgroups will see @cfts update on @ss->cftsets. Add/rm - * files for all cgroups which were created before. + * All cgroups which are created after we drop cgroup_mutex will + * have the updated set of files, so we only need to update the + * cgroups created before the current @cgroup_serial_nr_cursor. 
*/ - list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { - struct inode *inode = cgrp->dentry->d_inode; + update_upto = atomic64_read(&cgroup_serial_nr_cursor); + + mutex_unlock(&cgroup_mutex); + + /* @root always needs to be updated */ + inode = root->dentry->d_inode; + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + cgroup_addrm_files(root, ss, cfts, is_add); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + + /* add/rm files for all cgroups created before */ + rcu_read_lock(); + cgroup_for_each_descendant_pre(cgrp, root) { + if (cgroup_is_dead(cgrp)) + continue; + + inode = cgrp->dentry->d_inode; + dget(cgrp->dentry); + rcu_read_unlock(); + + dput(prev); + prev = cgrp->dentry; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - if (!cgroup_is_dead(cgrp)) + if (cgrp->serial_nr <= update_upto && !cgroup_is_dead(cgrp)) cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - list_del_init(&cgrp->cft_q_node); - dput(cgrp->dentry); + rcu_read_lock(); } - - if (sb) - deactivate_super(sb); - - mutex_unlock(&cgroup_cft_mutex); + rcu_read_unlock(); + dput(prev); + deactivate_super(sb); } /** @@ -4320,7 +4330,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->serial_nr = atomic64_inc_return(&cgroup_serial_nr_cursor); /* allocation complete, commit to creation */ - list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; @@ -4559,7 +4568,6 @@ static void cgroup_offline_fn(struct work_struct *work) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - list_del_init(&cgrp->allcg_node); dput(d); -- cgit v1.2.3 From 00356bd5f0f5e04183fb15805eb29e97c2fc20ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2013 11:14:22 -0700 Subject: cgroup: clean up cgroup_serial_nr_cursor cgroup_serial_nr_cursor was created atomic64_t because I thought it 
was never going to be used for anything other than assigning unique numbers to cgroups and didn't want to worry about synchronization; however, now we're using it as an event-stamp to distinguish cgroups created before and after a certain point which assumes that it's protected by cgroup_mutex. Let's make it clear by making it a u64. Also, rename it to cgroup_serial_nr_next and make it point to the next nr to allocate so that where it's pointing to is clear and more conventional. Signed-off-by: Tejun Heo Cc: Li Zefan --- kernel/cgroup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0ed7d8db6508..65f333ebb572 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -203,9 +203,10 @@ static struct cgroup_name root_cgroup_name = { .name = "/" }; * guarantees cgroups with bigger numbers are newer than those with smaller * numbers. Also, as cgroups are always appended to the parent's * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. + * the ascending serial number order on the list. Protected by + * cgroup_mutex. */ -static atomic64_t cgroup_serial_nr_cursor = ATOMIC64_INIT(0); +static u64 cgroup_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call.
This avoids us having to do @@ -2803,7 +2804,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; struct inode *inode; - u64 update_upto; + u64 update_before; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &rootnode || @@ -2815,9 +2816,9 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, /* * All cgroups which are created after we drop cgroup_mutex will * have the updated set of files, so we only need to update the - * cgroups created before the current @cgroup_serial_nr_cursor. + * cgroups created before the current @cgroup_serial_nr_next. */ - update_upto = atomic64_read(&cgroup_serial_nr_cursor); + update_before = cgroup_serial_nr_next; mutex_unlock(&cgroup_mutex); @@ -2844,7 +2845,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - if (cgrp->serial_nr <= update_upto && !cgroup_is_dead(cgrp)) + if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -4327,7 +4328,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); - cgrp->serial_nr = atomic64_inc_return(&cgroup_serial_nr_cursor); + cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); -- cgit v1.2.3 From 03c78cbebb323fc97295ff97dc5e009d56371d57 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Jun 2013 11:17:19 +0800 Subject: cgroup: rename cont to cgrp Cont is short for container. control group was named process container at first, but then people found container already has a meaning in linux kernel. Clean up the leftover variable name @cont. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 65f333ebb572..1051c1f69674 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5515,7 +5515,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5525,23 +5525,23 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) return css; } -static void debug_css_free(struct cgroup *cont) +static void debug_css_free(struct cgroup *cgrp) { - kfree(cont->subsys[debug_subsys_id]); + kfree(cgrp->subsys[debug_subsys_id]); } -static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) { - return cgroup_task_count(cont); + return cgroup_task_count(cgrp); } -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) +static u64 current_css_set_refcount_read(struct cgroup *cgrp, + struct cftype *cft) { u64 count; @@ -5551,7 +5551,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, return count; } -static int current_css_set_cg_links_read(struct cgroup *cont, +static int current_css_set_cg_links_read(struct cgroup *cgrp, struct cftype *cft, struct seq_file *seq) { @@ -5578,14 +5578,14 @@ static int current_css_set_cg_links_read(struct cgroup *cont, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cont, +static int 
cgroup_css_links_read(struct cgroup *cgrp, struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cont->cset_links, cset_link) { + list_for_each_entry(link, &cgrp->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; -- cgit v1.2.3 From e712209a9e0b70e78b13847738eb66fe37412515 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 6 Jun 2013 11:02:04 +0200 Subject: perf: Fix hypervisor branch sampling permission check Commit 2b923c8 perf/x86: Check branch sampling priv level in generic code was missing the check for the hypervisor (HV) priv level, so add it back. With this patch, we get the following correct behavior: # echo 2 >/proc/sys/kernel/perf_event_paranoid $ perf record -j any,k noploop 1 Error: You may not have permission to collect stats. Consider tweaking /proc/sys/kernel/perf_event_paranoid: -1 - Not paranoid at all 0 - Disallow raw tracepoint access for unpriv 1 - Disallow cpu events for unpriv 2 - Disallow kernel profiling for unpriv $ perf record -j any,hv noploop 1 Error: You may not have permission to collect stats. 
Consider tweaking /proc/sys/kernel/perf_event_paranoid: -1 - Not paranoid at all 0 - Disallow raw tracepoint access for unpriv 1 - Disallow cpu events for unpriv 2 - Disallow kernel profiling for unpriv Signed-off-by: Stephane Eranian Acked-by: Petr Matousek Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130606090204.GA3725@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d0e0d0d2025f..aca95bce34c8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6573,8 +6573,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, */ attr->branch_sample_type = mask; } - /* kernel level capture: check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_KERNEL) + /* privileged levels capture (kernel, hv): check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) return -EACCES; } -- cgit v1.2.3 From 03d8e80beb7db78a13c192431205b9c83f7e0cd1 Mon Sep 17 00:00:00 2001 From: Mischa Jonker Date: Tue, 4 Jun 2013 11:45:48 +0200 Subject: perf: Add const qualifier to perf_pmu_register's 'name' arg This allows us to use pdev->name for registering a PMU device. IMO the name is not supposed to be changed anyway. 
Signed-off-by: Mischa Jonker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1370339148-5566-1-git-send-email-mjonker@synopsys.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index aca95bce34c8..9c8920783317 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6179,7 +6179,7 @@ free_dev: static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, char *name, int type) +int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret; -- cgit v1.2.3 From e23ee74777f389369431d77390c4b09332ce026a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 7 Jun 2013 15:37:43 -0400 Subject: sched/rt: Simplify pull_rt_task() logic and remove .leaf_rt_rq_list [ Peter, this is based off of some of my work, I ran it though a few tests and it passed. I also reviewed it, and added my SOB as I am somewhat a co-author to it. ] Based on the patch by Steven Rostedt from previous year: https://lkml.org/lkml/2012/4/18/517 1)Simplify pull_rt_task() logic: search in pushable tasks of dest runqueue. The only pullable tasks are the tasks which are pushable in their local rq, and no others. 2)Remove .leaf_rt_rq_list member of struct rt_rq and functions connected with it: nobody uses it since now. 
Signed-off-by: Kirill Tkhai Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/287571370557898@web7d.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 82 ++++++++++------------------------------------------ kernel/sched/sched.h | 1 - 2 files changed, 16 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8d85f9ac4262..01970c8e64df 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg) (iter = next_task_group(iter)) && \ (rt_rq = iter->rt_rq[cpu_of(rq)]);) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_add_rcu(&rt_rq->leaf_rt_rq_list, - &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_del_rcu(&rt_rq->leaf_rt_rq_list); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t; #define for_each_rt_rq(rt_rq, iter, rq) \ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = NULL) @@ -1066,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - if (!rt_rq->rt_nr_running) - list_add_leaf_rt_rq(rt_rq); - if (head) list_add(&rt_se->run_list, queue); else @@ -1088,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); 
dec_rt_tasks(rt_se, rt_rq); - if (!rt_rq->rt_nr_running) - list_del_leaf_rt_rq(rt_rq); } /* @@ -1394,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +/* + * Return the highest pushable rq's task, which is suitable to be executed + * on the cpu, NULL otherwise + */ +static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { - struct task_struct *next = NULL; - struct sched_rt_entity *rt_se; - struct rt_prio_array *array; - struct rt_rq *rt_rq; - int idx; - - for_each_leaf_rt_rq(rt_rq, rq) { - array = &rt_rq->active; - idx = sched_find_first_bit(array->bitmap); -next_idx: - if (idx >= MAX_RT_PRIO) - continue; - if (next && next->prio <= idx) - continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *p; - if (!rt_entity_is_task(rt_se)) - continue; + if (!has_pushable_tasks(rq)) + return NULL; - p = rt_task_of(rt_se); - if (pick_rt_task(rq, p, cpu)) { - next = p; - break; - } - } - if (!next) { - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - goto next_idx; - } + plist_for_each_entry(p, head, pushable_tasks) { + if (pick_rt_task(rq, p, cpu)) + return p; } - return next; + return NULL; } static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); @@ -1703,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq) double_lock_balance(this_rq, src_rq); /* - * Are there still pullable RT tasks? + * We can pull only a task, which is pushable + * on its rq, and no others. 
*/ - if (src_rq->rt.rt_nr_running <= 1) - goto skip; - - p = pick_next_highest_task_rt(src_rq, this_cpu); + p = pick_highest_pushable_task(src_rq, this_cpu); /* * Do we have an RT task that preempts diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 74ff659e964f..029601a61587 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -361,7 +361,6 @@ struct rt_rq { unsigned long rt_nr_boosted; struct rq *rq; - struct list_head leaf_rt_rq_list; struct task_group *tg; #endif }; -- cgit v1.2.3 From 22b958d8cc5127d22d2ad2141277d312d93fad6c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 4 Jun 2013 14:23:39 +0800 Subject: sched: Refine the code in unthrottle_cfs_rq() Directly use rq to save some code. Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51AD87EB.1070605@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 143dcdbc47af..47a30be1fe83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2315,7 +2315,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) int enqueue = 1; long task_delta; - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + se = cfs_rq->tg->se[cpu_of(rq)]; cfs_rq->throttled = 0; -- cgit v1.2.3 From 8404c90d050733b3404dc36c500f63ccb0c972ce Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 4 Jun 2013 14:24:08 +0800 Subject: sched: Remove the useless declaration in kernel/sched/fair.c default_cfs_period(), do_sched_cfs_period_timer(), do_sched_cfs_slack_timer() are already defined previously, no need to declare them again.
Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51AD8808.7020608@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47a30be1fe83..c0ac2c3b56e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2618,10 +2618,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = -- cgit v1.2.3 From 0a0fca9d832b704f116a25badd1ca8c16771dcac Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2013 13:10:24 +0530 Subject: sched: Rename sched.c as sched/core.c in comments and Documentation Most of the stuff from kernel/sched.c was moved to kernel/sched/core.c a long time back and the comments/Documentation never got updated. I figured it out when I was going through sched-domains.txt and so thought of fixing it globally. I haven't cross-checked whether the stuff that is referenced in sched/core.c by all these files is still present and hasn't changed as that wasn't the motive behind this patch.
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/cdff76a265326ab8d71922a1db5be599f20aad45.1370329560.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 4 ++-- kernel/time.c | 2 +- kernel/workqueue_internal.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe5..902d13fc2b13 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -540,7 +540,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched.c + * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. @@ -569,7 +569,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched.c routine partition_sched_domains() in a + * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) 
diff --git a/kernel/time.c b/kernel/time.c index d3617dbd3dca..7c7964c33ae7 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -11,7 +11,7 @@ * Modification history kernel/time.c * * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() + * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index ad83c96b2ece..7e2204db0b1a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void) /* * Scheduler hooks for concurrency managed workqueue. Only to be used from - * sched.c and workqueue.c. + * sched/core.c and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); -- cgit v1.2.3 From 4a850cbefa9592ddde3670a41c10c9576a657c43 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2013 16:12:43 +0530 Subject: sched: Remove unused params of build_sched_domain() build_sched_domain() never uses parameter struct s_data *d and so passing it is useless. Remove it. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/545e0b4536166a15b4475abcafe5ed0db4ad4a2c.1370436120.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8f071cc9f51..342e74419f8f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5943,9 +5943,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *child, - int cpu) + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) { struct sched_domain *sd = tl->init(tl, cpu); if (!sd) @@ -5985,7 +5984,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, sd = NULL; for (tl = sched_domain_topology; tl->init; tl++) { - sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) -- cgit v1.2.3 From 22da956953f371c1ee7a578c31ed8c5702cb52b1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 4 Jun 2013 15:41:15 +0530 Subject: sched: Optimize build_sched_domains() for saving first SD node for a cpu We are saving first scheduling domain for a cpu in build_sched_domains() by iterating over the nested sd->child list. We don't actually need to do it this way. tl will be equal to sched_domain_topology for the first iteration and so we can set *per_cpu_ptr(d.sd, i) based on that. So, save pointer to first SD while running the iteration loop over tl's. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/fc473527cbc4dfa0b8eeef2a59db74684eb59a83.1370436120.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 342e74419f8f..137dcc03f66d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5985,16 +5985,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, sd = NULL; for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } - - while (sd->child) - sd = sd->child; - - *per_cpu_ptr(d.sd, i) = sd; } /* Build the groups for the domains */ -- cgit v1.2.3 From 1c6321694074163b5863c13d71c19ca953a3fb08 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Jun 2013 16:27:18 +0530 Subject: sched: Don't initialize alloc_state in build_sched_domains() alloc_state will be overwritten by __visit_domain_allocation_hell() and so we don't actually need to initialize alloc_state. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/df57734a075cc5ad130e1ae498702e24f2529ab8.1370861520.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 137dcc03f66d..3de62649869b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5969,7 +5969,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state = sa_none; + enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; int i, ret = -ENOMEM; -- cgit v1.2.3 From c75e01288ce9c9a6b7beb6b23c07d2e4d1db8c84 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Jun 2013 16:27:19 +0530 Subject: sched: Don't set sd->child to NULL when it is already NULL Memory for sd is allocated with kzalloc_node() which will initialize its fields with zero. In build_sched_domain() we are setting sd->child to child even if child is NULL, which isn't required. Lets do it only if child isn't NULL. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/f4753a1730051341003ad2ad29a3229c7356678e.1370861520.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3de62649869b..88c2c0ee5a52 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5955,8 +5955,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + sd->child = child; } - sd->child = child; set_domain_attribute(sd, attr); return sd; -- cgit v1.2.3 From 27723a68caf05381b0b0bc6e127da2c9e7bcb775 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Jun 2013 16:27:20 +0530 Subject: sched: Create for_each_sd_topology() The for loop for traversing sched_domain_topology was used at multiple places in core.c. This patch removes code redundancy by creating for_each_sd_topology().
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/e0e04542f54e9464bd9da54f5ccfe62ec6c4c0bc.1370861520.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88c2c0ee5a52..547b7d3ff893 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5565,6 +5565,9 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->init; tl++) + #ifdef CONFIG_NUMA static int sched_domains_numa_levels; @@ -5862,7 +5865,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; sdd->sd = alloc_percpu(struct sched_domain *); @@ -5915,7 +5918,7 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { @@ -5983,7 +5986,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; -- cgit v1.2.3 From 0936629f01bb1b11772db8c36be421365238cbec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Jun 2013 16:32:43 +0530 Subject: sched: Use cached value of span instead of calling sched_domain_span() In the beginning of build_sched_groups() we called sched_domain_span() and cached its return value in span. 
Few statements later we are calling it again to get the same pointer. Lets use the cached value instead as it hasn't changed in between. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/834ecd507071ad88aff039352dbc7e063dd996a7.1370948150.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 547b7d3ff893..3388387e1330 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5347,7 +5347,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); - if (cpu != cpumask_first(sched_domain_span(sd))) + if (cpu != cpumask_first(span)) return 0; lockdep_assert_held(&sched_domains_mutex); -- cgit v1.2.3 From cd08e9234c987766ad077bba80eb5a07d0855525 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Jun 2013 16:32:44 +0530 Subject: sched: Fix memory leakage in build_sched_groups() In build_sched_groups() we don't need to call get_group() for cpus which are already covered in previous iterations. Calling get_group() would mark the group used and eventually leak it since we wouldn't connect it and not find it again to free it. This will happen only in cases where sg->cpumask contained more than one cpu (For any topology level). This patch would free sg's memory for all cpus leaving the group leader as the group isn't marked used now. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/7a61e955abdcbb1dfa9fe493f11a5ec53a11ddd3.1370948150.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3388387e1330..014c97f00732 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5357,12 +5357,12 @@ build_sched_groups(struct sched_domain *sd, int cpu) for_each_cpu(i, span) { struct sched_group *sg; - int group = get_group(i, sdd, &sg); - int j; + int group, j; if (cpumask_test_cpu(i, covered)) continue; + group = get_group(i, sdd, &sg); cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); -- cgit v1.2.3 From 94c95ba69f31e435416988ddb223c92e5b0e9e83 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Jun 2013 16:32:45 +0530 Subject: sched: Remove WARN_ON(!sd) from init_sched_groups_power() sd can't be NULL in init_sched_groups_power() and so checking it for NULL isn't useful. In case it is required, then also we need to rearrange the code a bit as we already accessed invalid pointer sd to get sg: sg = sd->groups. 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2bbe633cd74b431c05253a8ce61fdfd5066a531b.1370948150.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 014c97f00732..21b1403a10a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5400,7 +5400,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; - WARN_ON(!sd || !sg); + WARN_ON(!sg); do { sg->group_weight = cpumask_weight(sched_group_cpus(sg)); -- cgit v1.2.3 From be7002e6c613d22976f2b8d4bae6121a5fc0433a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jun 2013 11:55:36 -0700 Subject: sched: Don't mix use of typedef ctl_table and struct ctl_table Just use struct ctl_table. Signed-off-by: Joe Perches Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371063336.2069.22.camel@joe-AO722 Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 21b1403a10a2..ceeaf0f45be0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4533,7 +4533,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return table; } -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) { struct ctl_table *entry, *table; struct sched_domain *sd; -- cgit v1.2.3 From e12d0271774fea9fddf1e2a7952a0bffb2ee8e8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 May 2013 17:12:28 -0400 Subject: nohz: Warn if the machine can not perform nohz_full If the user configures NO_HZ_FULL and defines nohz_full=XXX on the kernel command line, or enables NO_HZ_FULL_ALL, but nohz fails due to the machine having a unstable clock, warn about it. 
We do not want users thinking that they are getting the benefit of nohz when their machine can not support it. Signed-off-by: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..d87d22cb9bf2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -178,6 +178,11 @@ static bool can_stop_full_tick(void) */ if (!sched_clock_stable) { trace_tick_stop(0, "unstable sched clock\n"); + /* + * Don't allow the user to think they can get + * full NO_HZ with this machine. + */ + WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); return false; } #endif -- cgit v1.2.3 From b8900bc0217fac8e68085997bee2f05e6db931a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 6 Jun 2013 15:42:53 +0200 Subject: watchdog: Register / unregister watchdog kthreads on sysctl control The user activation/deactivation of the watchdog through boot parameters or sysctl is currently implemented with a dance involving kthreads parking and unparking methods: the threads are unconditionally registered on boot and they park as soon as the user wants the watchdog to be disabled. This method involves a few noisy details to handle though: the watchdog kthreads may be unparked anytime due to hotplug operations, after which the watchdog internals have to decide to park again if it is user-disabled. As a result the setup() and unpark() methods need to be able to request a reparking. This is not currently supported in the kthread infrastructure so this piece of the watchdog code only works halfway. Besides, unparking/reparking the watchdog kthreads consumes unnecessary cputime on hotplug operations when those could be simply ignored in the first place.
As suggested by Srivatsa, let's instead only register the watchdog threads when they are needed. This way we don't need to think about hotplug operations and we don't burden the CPU onlining when the watchdog is simply disabled. Suggested-by: Srivatsa S. Bhat Signed-off-by: Frederic Weisbecker Cc: Srivatsa S. Bhat Cc: Anish Singh Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Don Zickus --- kernel/watchdog.c | 87 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f07..52c9a9b91bdd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,7 +31,7 @@ int watchdog_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly watchdog_disabled; +static int __read_mostly watchdog_disabled = 1; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - if (!watchdog_enabled) { - kthread_park(current); - return; - } - /* Enable the perf event */ watchdog_nmi_enable(cpu); @@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); } +static void watchdog_cleanup(unsigned int cpu, bool online) +{ + watchdog_disable(cpu); +} + static int watchdog_should_run(unsigned int cpu) { return __this_cpu_read(hrtimer_interrupts) != @@ -475,28 +475,40 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL -static void watchdog_enable_all_cpus(void) +static struct smp_hotplug_thread watchdog_threads = { + .store 
= &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .cleanup = watchdog_cleanup, + .park = watchdog_disable, + .unpark = watchdog_enable, +}; + +static int watchdog_enable_all_cpus(void) { - unsigned int cpu; + int err = 0; if (watchdog_disabled) { - watchdog_disabled = 0; - for_each_online_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + err = smpboot_register_percpu_thread(&watchdog_threads); + if (err) + pr_err("Failed to create watchdog threads, disabled\n"); + else + watchdog_disabled = 0; } + + return err; } +/* prepare/enable/disable routines */ +/* sysctl functions */ +#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { - unsigned int cpu; - if (!watchdog_disabled) { watchdog_disabled = 1; - for_each_online_cpu(cpu) - kthread_park(per_cpu(softlockup_watchdog, cpu)); + smpboot_unregister_percpu_thread(&watchdog_threads); } } @@ -507,14 +519,14 @@ static void watchdog_disable_all_cpus(void) int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; + int err, old_thresh, old_enabled; - if (watchdog_disabled < 0) - return -ENODEV; + old_thresh = ACCESS_ONCE(watchdog_thresh); + old_enabled = ACCESS_ONCE(watchdog_enabled); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err || !write) + return err; set_sample_period(); /* @@ -523,29 +535,24 @@ int proc_dowatchdog(struct ctl_table *table, int write, * watchdog_*_all_cpus() function takes care of this. 
*/ if (watchdog_enabled && watchdog_thresh) - watchdog_enable_all_cpus(); + err = watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); - return ret; + /* Restore old values on failure */ + if (err) { + watchdog_thresh = old_thresh; + watchdog_enabled = old_enabled; + } + + return err; } #endif /* CONFIG_SYSCTL */ -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - void __init lockup_detector_init(void) { set_sample_period(); - if (smpboot_register_percpu_thread(&watchdog_threads)) { - pr_err("Failed to create watchdog threads, disabled\n"); - watchdog_disabled = -ENODEV; - } + + if (watchdog_enabled) + watchdog_enable_all_cpus(); } -- cgit v1.2.3 From 1a891cf19cdfb645827969cc6aeaeebdefeb87b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Jun 2013 13:16:25 -0400 Subject: tracing: Add binary '&' filter for events There are some cases when filtering on a set flag of a field of a tracepoint is useful. But currently the only filtering commands for numbered fields is ==, !=, <, <=, >, >=. This does not help when you just want to trace if a specific flag is set. For example: > # sudo trace-cmd record -e brcmfmac:brcmf_dbg -f 'level & 0x40000' > disable all > enable brcmfmac:brcmf_dbg > path = /sys/kernel/debug/tracing/events/brcmfmac/brcmf_dbg/enable > (level & 0x40000) > ^ > parse_error: Invalid operator > When trying to trace brcmf_dbg when level has its 1 << 18 bit set, the filter fails to perform. By allowing a binary '&' operation, this gives the user the ability to test a bit. Note, a binary '|' is not added, as it doesn't make sense as fields must be compared to constants (for now), and ORing a constant will always return true. 
Link: http://lkml.kernel.org/r/1371057385.9844.261.camel@gandalf.local.home Suggested-by: Arend van Spriel Tested-by: Arend van Spriel Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1b653f7e1ca..0d883dc057d6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -44,6 +44,7 @@ enum filter_op_ids OP_LE, OP_GT, OP_GE, + OP_BAND, OP_NONE, OP_OPEN_PAREN, }; @@ -54,6 +55,7 @@ struct filter_op { int precedence; }; +/* Order must be the same as enum filter_op_ids above */ static struct filter_op filter_ops[] = { { OP_OR, "||", 1 }, { OP_AND, "&&", 2 }, @@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = { { OP_LE, "<=", 5 }, { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, + { OP_BAND, "&", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ case OP_GE: \ match = (*addr >= val); \ break; \ + case OP_BAND: \ + match = (*addr & val); \ + break; \ default: \ break; \ } \ -- cgit v1.2.3 From de7edd31457b626e54a0b2a7e8ff4d65492f01ad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 14 Jun 2013 16:21:43 -0400 Subject: tracing: Disable tracing on warning Add a traceoff_on_warning option in both the kernel command line as well as a sysctl option. When set, any WARN*() function that is hit will cause the tracing_on variable to be cleared, which disables writing to the ring buffer. This is useful especially when tracing a bug with function tracing. When a warning is hit, the print caused by the warning can flood the trace with the functions that producing the output for the warning. This can make the resulting trace useless by either hiding where the bug happened, or worse, by overflowing the buffer and losing the trace of the bug totally. 
Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/panic.c | 3 +++ kernel/sysctl.c | 7 +++++++ kernel/trace/trace.c | 17 +++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..4cea6cc628ab 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -399,6 +400,8 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { + disable_trace_on_warning(); + printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..5b0f18c12800 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -600,6 +600,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_MODULES { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f4a09c12e0b..c4c9296b1916 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask; enum ftrace_dump_mode ftrace_dump_on_oops; +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + static int tracing_set_tracer(const char *buf); #define MAX_TRACER_SIZE 100 @@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init stop_trace_on_warning(char *str) +{ + __disable_trace_on_warning = 1; + return 1; +} +__setup("traceoff_on_warning=", stop_trace_on_warning); + static int __init boot_alloc_snapshot(char *str) { allocate_snapshot = 
true; @@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); + unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; @@ -562,6 +573,12 @@ void tracing_off(void) } EXPORT_SYMBOL_GPL(tracing_off); +void disable_trace_on_warning(void) +{ + if (__disable_trace_on_warning) + tracing_off(); +} + /** * tracing_is_on - show state of ring buffers enabled */ -- cgit v1.2.3 From 195a84d91e92ee3fe571a2086a6db7e17bf5bc7c Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Fri, 14 Jun 2013 10:10:38 +0800 Subject: tracing/kprobes: Remove unnecessary checking of trace_probe_is_enabled Since tp->flags assignment was moved into function enable_trace_probe(), there is no need to use trace_probe_is_enabled to check flags in the same function. Remove the unnecessary checking. Link: http://lkml.kernel.org/r/51BA7B9E.3040807@huawei.com Acked-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9f46e98ba8f2..f2374172ba7b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -240,8 +240,7 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } else tp->flags |= TP_FLAG_PROFILE; - if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && - !trace_probe_has_gone(tp)) { + if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) ret = enable_kretprobe(&tp->rp); else -- cgit v1.2.3 From 52d85d763086594f139bf7d3a5641abeb91d9f57 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 12 Jun 2013 12:03:18 +0200 Subject: ftrace: Fix stddev calculation in function profiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
When FUNCTION_GRAPH_TRACER is enabled, ftrace can profile kernel functions and print basic statistics about them. Unfortunately, running stddev calculation is wrong. This patch corrects it by implementing Welford’s method: s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) . Link: http://lkml.kernel.org/r/1371031398-24048-1-git-send-email-juri.lelli@gmail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 800a8a2fbddb..26e19105cdcc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -641,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v) if (rec->counter <= 1) stddev = 0; else { - stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + /* * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, (rec->counter - 1) * 1000); + do_div(stddev, rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); -- cgit v1.2.3 From 3c00ea82c724fab0b98f15428a804cb45eb9ad38 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 May 2013 20:45:15 +0200 Subject: watchdog: Rename confusing state variable We have two very conflicting state variable names in the watchdog: * watchdog_enabled: This one reflects the user interface. It's set to 1 by default and can be overridden with boot options or sysctl/procfs interface. * watchdog_disabled: This is the internal toggle state that tells if watchdog threads, timers and NMI events are currently running or not. This state mostly depends on the user settings. It's a convenient state latch.
Now we really need to find clearer names because those are just too confusing to encourage deep review. watchdog_enabled now becomes watchdog_user_enabled to reflect its purpose as an interface. watchdog_disabled becomes watchdog_running to suggest its role as a pure internal state. Signed-off-by: Frederic Weisbecker Cc: Srivatsa S. Bhat Cc: Anish Singh Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Don Zickus --- kernel/sysctl.c | 4 ++-- kernel/watchdog.c | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..b0805652c4ff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -801,7 +801,7 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_enabled, + .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dowatchdog, @@ -828,7 +828,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &watchdog_enabled, + .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dowatchdog, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 52c9a9b91bdd..51c4f34d258e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,9 +29,9 @@ #include #include -int watchdog_enabled = 1; +int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly watchdog_disabled = 1; +static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nmi_watchdog=", 
hardlockup_panic_setup); @@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); @@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup); /* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_enabled) { + if (watchdog_user_enabled) { unsigned cpu; for_each_present_cpu(cpu) { @@ -490,12 +490,12 @@ static int watchdog_enable_all_cpus(void) { int err = 0; - if (watchdog_disabled) { + if (!watchdog_running) { err = smpboot_register_percpu_thread(&watchdog_threads); if (err) pr_err("Failed to create watchdog threads, disabled\n"); else - watchdog_disabled = 0; + watchdog_running = 1; } return err; @@ -506,8 +506,8 @@ static int watchdog_enable_all_cpus(void) #ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { - if (!watchdog_disabled) { - watchdog_disabled = 1; + if (watchdog_running) { + watchdog_running = 0; smpboot_unregister_percpu_thread(&watchdog_threads); } } @@ -522,7 +522,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, int err, old_thresh, old_enabled; old_thresh = ACCESS_ONCE(watchdog_thresh); - old_enabled = ACCESS_ONCE(watchdog_enabled); + old_enabled = ACCESS_ONCE(watchdog_user_enabled); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err || !write) @@ -531,10 +531,10 @@ int proc_dowatchdog(struct ctl_table *table, int write, set_sample_period(); /* * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_disabled' variable check in + * disabled. The 'watchdog_running' variable check in * watchdog_*_all_cpus() function takes care of this. 
*/ - if (watchdog_enabled && watchdog_thresh) + if (watchdog_user_enabled && watchdog_thresh) err = watchdog_enable_all_cpus(); else watchdog_disable_all_cpus(); @@ -542,7 +542,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, /* Restore old values on failure */ if (err) { watchdog_thresh = old_thresh; - watchdog_enabled = old_enabled; + watchdog_user_enabled = old_enabled; } return err; @@ -553,6 +553,6 @@ void __init lockup_detector_init(void) { set_sample_period(); - if (watchdog_enabled) + if (watchdog_user_enabled) watchdog_enable_all_cpus(); } -- cgit v1.2.3 From 940be35ac0139530d7554aa2352a8388e3d4adca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Jun 2013 13:35:42 +0200 Subject: watchdog: Boot-disable by default on full dynticks When the watchdog runs, it prevents the full dynticks CPUs from stopping their tick because the hard lockup detector uses perf events internally, which in turn rely on the periodic tick. Since this is a rather confusing behaviour that is not easy to track down and identify for those who want to test CONFIG_NO_HZ_FULL, let's default disable the watchdog on boot time when full dynticks is enabled. The user can still enable it later on runtime using proc or sysctl. Reported-by: Steven Rostedt Suggested-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Li Zhong Cc: Don Zickus Cc: Srivatsa S. 
Bhat Cc: Anish Singh --- kernel/watchdog.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51c4f34d258e..1241d8c91d5e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -553,6 +553,14 @@ void __init lockup_detector_init(void) { set_sample_period(); +#ifdef CONFIG_NO_HZ_FULL + if (watchdog_user_enabled) { + watchdog_user_enabled = 0; + pr_warning("Disabled lockup detectors by default for full dynticks\n"); + pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); + } +#endif + if (watchdog_user_enabled) watchdog_enable_all_cpus(); } -- cgit v1.2.3 From 5b8621a68fdcd2baf1d3b413726f913a5254d46a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Jun 2013 13:47:31 +0200 Subject: nohz: Remove obsolete check for full dynticks CPUs to be RCU nocbs Building full dynticks now implies that all CPUs are forced into RCU nocb mode through CONFIG_RCU_NOCB_CPU_ALL. The dynamic check has become useless. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. 
McKenney Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Li Zhong Cc: Borislav Petkov --- kernel/time/tick-sched.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d87d22cb9bf2..b15750139260 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -351,16 +351,6 @@ void __init tick_nohz_init(void) } cpu_notifier(tick_nohz_cpu_down_callback, 0); - - /* Make sure full dynticks CPU are also RCU nocbs */ - for_each_cpu(cpu, nohz_full_mask) { - if (!rcu_is_nocb_cpu(cpu)) { - pr_warning("NO_HZ: CPU %d is not RCU nocb: " - "cleared from nohz_full range", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); - } - } - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); } -- cgit v1.2.3 From e1ebe86203e6532eb5a0ae8f26ccae47aca548ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:11 +0200 Subject: hw_breakpoint: Simplify list/idx mess in toggle_bp_slot() paths The enable/disable logic in toggle_bp_slot() is not symmetrical and imho very confusing. "old_count" in toggle_bp_task_slot() is actually new_count because this bp was already removed from the list. Change toggle_bp_slot() to always call list_add/list_del after toggle_bp_task_slot(). This way old_idx is task_bp_pinned() and this entry should be decremented, new_idx is +/-weight and we need to increment this element. The code/logic looks obvious. 
Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/20130620155011.GA6330@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index ef8ebe560949..dee0148dcf54 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -185,26 +185,20 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, enum bp_type_idx type, int weight) { - unsigned int *tsk_pinned; - int old_count = 0; - int old_idx = 0; - int idx = 0; - - old_count = task_bp_pinned(cpu, bp, type); - old_idx = old_count - 1; - idx = old_idx + weight; - - /* tsk_pinned[n] is the number of tasks having n breakpoints */ - tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - if (enable) { - tsk_pinned[idx]++; - if (old_count > 0) - tsk_pinned[old_idx]--; - } else { - tsk_pinned[idx]--; - if (old_count > 0) - tsk_pinned[old_idx]++; - } + /* tsk_pinned[n-1] is the number of tasks having n>0 breakpoints */ + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + int old_idx, new_idx; + + old_idx = task_bp_pinned(cpu, bp, type) - 1; + if (enable) + new_idx = old_idx + weight; + else + new_idx = old_idx - weight; + + if (old_idx >= 0) + tsk_pinned[old_idx]--; + if (new_idx >= 0) + tsk_pinned[new_idx]++; } /* @@ -228,10 +222,6 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, } /* Pinned counter task profiling */ - - if (!enable) - list_del(&bp->hw.bp_list); - if (cpu >= 0) { toggle_bp_task_slot(bp, cpu, enable, type, weight); } else { @@ -241,6 +231,8 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, if (enable) list_add_tail(&bp->hw.bp_list, &bp_task_head); + else + 
list_del(&bp->hw.bp_list); } /* -- cgit v1.2.3 From 7ab71f3244e9f970c29566c5a67e13d1fa38c387 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:13 +0200 Subject: hw_breakpoint: Simplify the "weight" usage in toggle_bp_slot() paths Change toggle_bp_slot() to make "weight" negative if !enable. This way we can always use "+ weight" without additional "if (enable)" check and toggle_bp_task_slot() no longer needs this arg. Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/20130620155013.GA6337@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index dee0148dcf54..5cd4f6d9652c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -182,7 +182,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { /* tsk_pinned[n-1] is the number of tasks having n>0 breakpoints */ @@ -190,10 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, int old_idx, new_idx; old_idx = task_bp_pinned(cpu, bp, type) - 1; - if (enable) - new_idx = old_idx + weight; - else - new_idx = old_idx - weight; + new_idx = old_idx + weight; if (old_idx >= 0) tsk_pinned[old_idx]--; @@ -211,22 +208,21 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int cpu = bp->cpu; struct task_struct *tsk = bp->hw.bp_target; + if (!enable) + weight = -weight; + /* Pinned counter cpu profiling */ if (!tsk) { - - if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - 
per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + per_cpu(nr_cpu_bp_pinned[type], cpu) += weight; return; } /* Pinned counter task profiling */ if (cpu >= 0) { - toggle_bp_task_slot(bp, cpu, enable, type, weight); + toggle_bp_task_slot(bp, cpu, type, weight); } else { for_each_possible_cpu(cpu) - toggle_bp_task_slot(bp, cpu, enable, type, weight); + toggle_bp_task_slot(bp, cpu, type, weight); } if (enable) -- cgit v1.2.3 From 1c10adbb929936316f71df089ace699fce037e24 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:15 +0200 Subject: hw_breakpoint: Introduce cpumask_of_bp() Add the trivial helper which simply returns cpumask_of() or cpu_possible_mask depending on bp->cpu. Change fetch_bp_busy_slots() and toggle_bp_slot() to always do for_each_cpu(cpumask_of_bp) to simplify the code and avoid the code duplication. Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/20130620155015.GA6340@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 5cd4f6d9652c..9c71445328af 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -127,6 +127,13 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) return count; } +static const struct cpumask *cpumask_of_bp(struct perf_event *bp) +{ + if (bp->cpu >= 0) + return cpumask_of(bp->cpu); + return cpu_possible_mask; +} + /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). 
@@ -135,25 +142,13 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - slots->pinned += max_task_bp_pinned(cpu, type); - else - slots->pinned += task_bp_pinned(cpu, bp, type); - slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - - return; - } + const struct cpumask *cpumask = cpumask_of_bp(bp); + int cpu; - for_each_possible_cpu(cpu) { - unsigned int nr; + for_each_cpu(cpu, cpumask) { + unsigned int nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) + if (!bp->hw.bp_target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -205,25 +200,21 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; + const struct cpumask *cpumask = cpumask_of_bp(bp); + int cpu; if (!enable) weight = -weight; /* Pinned counter cpu profiling */ - if (!tsk) { - per_cpu(nr_cpu_bp_pinned[type], cpu) += weight; + if (!bp->hw.bp_target) { + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; return; } /* Pinned counter task profiling */ - if (cpu >= 0) { + for_each_cpu(cpu, cpumask) toggle_bp_task_slot(bp, cpu, type, weight); - } else { - for_each_possible_cpu(cpu) - toggle_bp_task_slot(bp, cpu, type, weight); - } if (enable) list_add_tail(&bp->hw.bp_list, &bp_task_head); -- cgit v1.2.3 From e12cbc10cb27fcbe51b5f68e2015138dc451a2eb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:18 +0200 Subject: hw_breakpoint: Simplify *register_wide_hw_breakpoint() 1. register_wide_hw_breakpoint() can use unregister_ if failure, no need to duplicate the code. 2. 
"struct perf_event **pevent" adds the unnecessary level of indirection and complication, use per_cpu(*cpu_events, cpu). Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/20130620155018.GA6347@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9c71445328af..38418f786f36 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -497,8 +497,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { - struct perf_event * __percpu *cpu_events, **pevent, *bp; - long err; + struct perf_event * __percpu *cpu_events, *bp; + long err = 0; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); @@ -507,31 +507,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); - - *pevent = bp; - if (IS_ERR(bp)) { err = PTR_ERR(bp); - goto fail; + break; } - } - put_online_cpus(); - - return cpu_events; -fail: - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent)) - break; - unregister_hw_breakpoint(*pevent); + per_cpu(*cpu_events, cpu) = bp; } put_online_cpus(); - free_percpu(cpu_events); + if (likely(!err)) + return cpu_events; + + unregister_wide_hw_breakpoint(cpu_events); return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -543,12 +533,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; - struct perf_event **pevent; - for_each_possible_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - 
unregister_hw_breakpoint(*pevent); - } + for_each_possible_cpu(cpu) + unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); + free_percpu(cpu_events); } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); -- cgit v1.2.3 From bde96030f438b5eb6fb74f3bdd06d9f68bb3ba00 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:20 +0200 Subject: hw_breakpoint: Introduce "struct bp_cpuinfo" This patch simply moves all per-cpu variables into the new single per-cpu "struct bp_cpuinfo". To me this looks more logical and clean, but this can also simplify the further potential changes. In particular, I do not think this memory should be per-cpu, it is never used "locally". After this change it is trivial to turn it into, say, bootmem[nr_cpu_ids]. Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Link: http://lkml.kernel.org/r/20130620155020.GA6350@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 69 ++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 38418f786f36..1559fb0b9296 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -46,23 +46,26 @@ #include #include - - /* * Constraints data */ +struct bp_cpuinfo { + /* Number of pinned cpu breakpoints in a cpu */ + unsigned int cpu_pinned; + /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ + unsigned int *tsk_pinned; + /* Number of non-pinned cpu/task breakpoints in a cpu */ + unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ +}; -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned 
int, nr_bp_flexible[TYPE_MAX]); - +static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) +{ + return per_cpu_ptr(bp_cpuinfo + type, cpu); +} + /* Keep track of the breakpoints attached to tasks */ static LIST_HEAD(bp_task_head); @@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { + unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -146,8 +149,10 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int cpu; for_each_cpu(cpu, cpumask) { - unsigned int nr = per_cpu(nr_cpu_bp_pinned[type], cpu); + struct bp_cpuinfo *info = get_bp_info(cpu, type); + int nr; + nr = info->cpu_pinned; if (!bp->hw.bp_target) nr += max_task_bp_pinned(cpu, type); else @@ -156,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible[type], cpu); - + nr = info->flexible; if (nr > slots->flexible) slots->flexible = nr; } @@ -180,8 +184,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - /* tsk_pinned[n-1] is the number of tasks having n>0 breakpoints */ - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int old_idx, new_idx; old_idx = task_bp_pinned(cpu, bp, type) - 1; @@ -208,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.bp_target) { - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + get_bp_info(bp->cpu, 
type)->cpu_pinned += weight; return; } @@ -240,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to a single cpu, check: * - * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) + * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. @@ -251,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to every cpus, check: * - * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) + * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -263,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to a single cpu, check: * - * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) + * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM * - * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * -> Same checks as before. But now the info->flexible, if any, must keep * one register at least (or they will never be fed). 
* * - If attached to every cpus, check: * - * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) + * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ static int __reserve_bp_slot(struct perf_event *bp) { @@ -622,7 +625,6 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - unsigned int **task_bp_pinned; int cpu, err_cpu; int i; @@ -631,10 +633,11 @@ int __init init_hw_breakpoint(void) for_each_possible_cpu(cpu) { for (i = 0; i < TYPE_MAX; i++) { - task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); - *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], - GFP_KERNEL); - if (!*task_bp_pinned) + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), + GFP_KERNEL); + if (!info->tsk_pinned) goto err_alloc; } } @@ -648,7 +651,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); + kfree(get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } -- cgit v1.2.3 From d24c2a4f919d17bd1ae4f4010a38ab07ece99cf7 Mon Sep 17 00:00:00 2001 From: Sahara Date: Thu, 20 Jun 2013 11:33:57 +0900 Subject: PM / QoS: correct the valid range of pm_qos_class The valid start index for pm_qos_array is not 0, but PM_QOS_CPU_DMA_LATENCY. There is a null_pm_qos at index 0 of pm_qos_array. However, null_pm_qos is not created as misc device so that inclusion of 0 index for checking pm_qos_class especially for file operations is not proper here. [rjw: Changelog, a bit] Signed-off-by: Sahara Signed-off-by: Rafael J. 
Wysocki --- kernel/power/qos.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 587dddeebf15..f2f5f6e22a3c 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -477,7 +477,7 @@ static int find_pm_qos_object_by_minor(int minor) { int pm_qos_class; - for (pm_qos_class = 0; + for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { if (minor == pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) @@ -491,7 +491,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) long pm_qos_class; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= 0) { + if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; @@ -584,7 +584,7 @@ static int __init pm_qos_power_init(void) BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); - for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i]); if (ret < 0) { printk(KERN_ERR "pm_qos_param: %s setup failed\n", -- cgit v1.2.3 From bb177fedd348c92c2bea6adc9a2163ebff15272e Mon Sep 17 00:00:00 2001 From: Julius Werner Date: Wed, 12 Jun 2013 12:55:22 -0700 Subject: PM / Sleep: Print last wakeup source on failed wakeup_count write Commit a938da06 introduced a useful little log message to tell users/debuggers which wakeup source aborted a suspend. However, this message is only printed if the abort happens during the in-kernel suspend path (after writing /sys/power/state). The full specification of the /sys/power/wakeup_count facility allows user-space power managers to double-check if wakeups have already happened before it actually tries to suspend (e.g. while it was running user-space pre-suspend hooks), by writing the last known wakeup_count value to /sys/power/wakeup_count. 
This patch changes the sysfs handler for that node to also print said log message if that write fails, so that we can figure out the offending wakeup source for both kinds of suspend aborts. Signed-off-by: Julius Werner Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index d77663bfedeb..0828070d38b4 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj, if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; + else + pm_print_active_wakeup_sources(); } out: -- cgit v1.2.3 From 14c63f17b1fde5a575a28e96547a22b451c71fb5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 21 Jun 2013 08:51:36 -0700 Subject: perf: Drop sample rate when sampling is too slow This patch keeps track of how long perf's NMI handler is taking, and also calculates how many samples perf can take a second. If the sample length times the expected max number of samples exceeds a configurable threshold, it drops the sample rate. This way, we don't have a runaway sampling process eating up the CPU. This patch can tend to drop the sample rate down to a level where perf doesn't work very well. *BUT* the alternative is that my system hangs because it spends all of its time handling NMIs. I'll take a busted performance tool over an entire system that's busted and undebuggable any day. BTW, my suspicion is that there's still an underlying bug here. Using the HPET instead of the TSC is definitely a contributing factor, but I suspect there are some other things going on. But, I can't go dig down on a bug like that with my machine hanging all the time. Signed-off-by: Dave Hansen Acked-by: Peter Zijlstra Cc: paulus@samba.org Cc: acme@ghostprotocols.net Cc: Dave Hansen [ Prettified it a bit. 
] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 9 +++++ 2 files changed, 97 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c8920783317..1db3af933704 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -165,10 +165,26 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' /* * max perf event sample rate */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +#define DEFAULT_MAX_SAMPLE_RATE 100000 +#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) +#define DEFAULT_CPU_TIME_MAX_PERCENT 25 + +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; + +static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; + +static atomic_t perf_sample_allowed_ns __read_mostly = + ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); + +void update_perf_cpu_limits(void) +{ + u64 tmp = perf_sample_period_ns; + + tmp *= sysctl_perf_cpu_time_max_percent; + tmp = do_div(tmp, 100); + atomic_set(&perf_sample_allowed_ns, tmp); +} static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -182,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + update_perf_cpu_limits(); return 0; } +int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; + +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t 
*ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + update_perf_cpu_limits(); + + return 0; +} + +/* + * perf samples are done in some very critical code paths (NMIs). + * If they take too much CPU time, the system can lock up and not + * get any real work done. This will drop the sample rate when + * we detect that events are taking too long. + */ +#define NR_ACCUMULATED_SAMPLES 128 +DEFINE_PER_CPU(u64, running_sample_length); + +void perf_sample_event_took(u64 sample_len_ns) +{ + u64 avg_local_sample_len; + u64 local_samples_len = __get_cpu_var(running_sample_length); + + if (atomic_read(&perf_sample_allowed_ns) == 0) + return; + + /* decay the counter by 1 average sample */ + local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; + local_samples_len += sample_len_ns; + __get_cpu_var(running_sample_length) = local_samples_len; + + /* + * note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * from having to maintain a count. 
+ */ + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + return; + + if (max_samples_per_tick <= 1) + return; + + max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); + sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + + printk_ratelimited(KERN_WARNING + "perf samples too long (%lld > %d), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, + atomic_read(&perf_sample_allowed_ns), + sysctl_perf_event_sample_rate); + + update_perf_cpu_limits(); +} + static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b0a1f99907f3..4ce13c3cedb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1043,6 +1043,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_proc_update_handler, }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_KMEMCHECK { -- cgit v1.2.3 From 247e9ee034b0448a585afa16e292cbb9dc0aef68 Mon Sep 17 00:00:00 2001 From: Sahara Date: Fri, 21 Jun 2013 11:12:28 +0900 Subject: PM / QoS: Add pm_qos_update_target/flags tracepoints This patch adds tracepoints to pm_qos_update_target and pm_qos_update_flags. It's useful for checking pm qos action, previous value and current value. Signed-off-by: Sahara Signed-off-by: Rafael J. 
Wysocki --- kernel/power/qos.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index f2f5f6e22a3c..4fb8d1427938 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -44,6 +44,7 @@ #include #include +#include /* * locking rule: all changes to constraints or notifiers lists @@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, spin_unlock_irqrestore(&pm_qos_lock, flags); + trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value != curr_value) { blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, @@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, spin_unlock_irqrestore(&pm_qos_lock, irqflags); + trace_pm_qos_update_flags(action, prev_value, curr_value); return prev_value != curr_value; } -- cgit v1.2.3 From ae8822b842e229fa4459fca2d979b630d812311d Mon Sep 17 00:00:00 2001 From: Sahara Date: Fri, 21 Jun 2013 11:12:29 +0900 Subject: PM / QoS: Add pm_qos_request tracepoints Adds tracepoints to pm_qos_add_request, pm_qos_update_request, pm_qos_remove_request, and pm_qos_update_request_timeout. It's useful for checking pm_qos_class, value, and timeout_us. Signed-off-by: Sahara Signed-off-by: Rafael J. 
Wysocki --- kernel/power/qos.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 4fb8d1427938..06fe28589e9c 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -336,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req, } req->pm_qos_class = pm_qos_class; INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); + trace_pm_qos_add_request(pm_qos_class, value); pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, &req->node, PM_QOS_ADD_REQ, value); } @@ -364,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req, cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request(req->pm_qos_class, new_value); if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, @@ -390,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request_timeout(req->pm_qos_class, + new_value, timeout_us); if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, @@ -419,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) cancel_delayed_work_sync(&req->work); + trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); -- cgit v1.2.3 From ddaf144c61da45ae5c49ae38556c3ac4524f9318 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 10 Jun 2013 01:06:02 +0100 Subject: irqdomain: Refactor irq_domain_associate_many() Originally, irq_domain_associate_many() was designed to unwind the mapped irqs on a failure of any individual association. However, that proved to be a problem with certain IRQ controllers. Some of them only support a subset of irqs, and will fail when attempting to map a reserved IRQ. 
In those cases we want to map as many IRQs as possible, so instead it is better for irq_domain_associate_many() to make a best-effort attempt to map irqs, but not fail if any or all of them don't succeed. If a caller really cares about how many irqs got associated, then it should instead go back and check that all of the irqs it cares about were mapped. The original design open-coded the individual association code into the body of irq_domain_associate_many(), but with no longer needing to unwind associations, the code becomes simpler to split out irq_domain_associate() to contain the bulk of the logic, and irq_domain_associate_many() to be a simple loop wrapper. This patch also adds a new error check to the associate path to make sure it isn't called for an irq larger than the controller can handle, and adds locking so that the irq_domain_mutex is held while setting up a new association. v3: Fixup missing change to irq_domain_add_tree() v2: Fixup x86 warning. irq_domain_associate_many() no longer returns an error code, but reports errors to the printk log directly. In the majority of cases we don't actually want to fail if there is a problem, but rather log it and still try to boot the system. Signed-off-by: Grant Likely irqdomain: Fix flubbed irq_domain_associate_many refactoring commit d39046ec72, "irqdomain: Refactor irq_domain_associate_many()" was missing the following hunk which causes a boot failure on anything using irq_domain_add_tree() to allocate an irq domain. 
Signed-off-by: Grant Likely Cc: Michael Neuling Cc: Benjamin Herrenschmidt , Cc: Thomas Gleixner , Cc: Stephen Rothwell --- kernel/irq/irqdomain.c | 185 ++++++++++++++++++++++++------------------------- 1 file changed, 91 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 280b8047d8db..80e92492c77b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -35,8 +35,8 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct device_node *of_node, - int size, int direct_max, +struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, + irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) { @@ -52,6 +52,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; @@ -126,7 +127,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, size, 0, ops, host_data); + domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); if (!domain) return NULL; @@ -139,7 +140,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); } - WARN_ON(irq_domain_associate_many(domain, first_irq, 0, size)); + irq_domain_associate_many(domain, first_irq, 0, size); } return domain; @@ -170,11 +171,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, first_hwirq + size, 0, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq 
+ size, + first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; - WARN_ON(irq_domain_associate_many(domain, first_irq, first_hwirq, size)); + irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } @@ -228,109 +230,109 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate_many(struct irq_domain *domain, - unsigned int irq_base, int count) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { - /* - * disassociate in reverse order; - * not strictly necessary, but nice for unwinding - */ - while (count--) { - int irq = irq_base + count; - struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq; + struct irq_data *irq_data = irq_get_irq_data(irq); + irq_hw_number_t hwirq; - if (WARN_ON(!irq_data || irq_data->domain != domain)) - continue; + if (WARN(!irq_data || irq_data->domain != domain, + "virq%i doesn't exist; cannot disassociate\n", irq)) + return; - hwirq = irq_data->hwirq; - irq_set_status_flags(irq, IRQ_NOREQUEST); + hwirq = irq_data->hwirq; + irq_set_status_flags(irq, IRQ_NOREQUEST); - /* remove chip and handler */ - irq_set_chip_and_handler(irq, NULL, NULL); + /* remove chip and handler */ + irq_set_chip_and_handler(irq, NULL, NULL); - /* Make sure it's completed */ - synchronize_irq(irq); + /* Make sure it's completed */ + synchronize_irq(irq); - /* Tell the PIC about it */ - if (domain->ops->unmap) - domain->ops->unmap(domain, irq); - smp_mb(); + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, irq); + smp_mb(); - irq_data->domain = NULL; - irq_data->hwirq = 0; + irq_data->domain = NULL; + irq_data->hwirq = 0; - /* Clear reverse map for this hwirq */ - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = 0; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - } 
+ /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); } } -int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, - irq_hw_number_t hwirq_base, int count) +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - unsigned int virq = irq_base; - irq_hw_number_t hwirq = hwirq_base; - int i, ret; + struct irq_data *irq_data = irq_get_irq_data(virq); + int ret; - pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, - of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + if (WARN(hwirq >= domain->hwirq_max, + "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) + return -EINVAL; + if (WARN(!irq_data, "error: virq%i is not allocated", virq)) + return -EINVAL; + if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) + return -EINVAL; - for (i = 0; i < count; i++) { - struct irq_data *irq_data = irq_get_irq_data(virq + i); - - if (WARN(!irq_data, "error: irq_desc not allocated; " - "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) - return -EINVAL; - if (WARN(irq_data->domain, "error: irq_desc already associated; " - "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) - return -EINVAL; - }; - - for (i = 0; i < count; i++, virq++, hwirq++) { - struct irq_data *irq_data = irq_get_irq_data(virq); - - irq_data->hwirq = hwirq; - irq_data->domain = domain; - if (domain->ops->map) { - ret = domain->ops->map(domain, virq, hwirq); - if (ret != 0) { - /* - * If map() returns -EPERM, this interrupt is protected - * by the firmware or some other service and shall not - * be mapped. Don't bother telling the user about it. 
- */ - if (ret != -EPERM) { - pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", - domain->name, hwirq, virq, ret); - } - irq_data->domain = NULL; - irq_data->hwirq = 0; - continue; + mutex_lock(&irq_domain_mutex); + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map) { + ret = domain->ops->map(domain, virq, hwirq); + if (ret != 0) { + /* + * If map() returns -EPERM, this interrupt is protected + * by the firmware or some other service and shall not + * be mapped. Don't bother telling the user about it. + */ + if (ret != -EPERM) { + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + domain->name, hwirq, virq, ret); } - /* If not already assigned, give the domain the chip's name */ - if (!domain->name && irq_data->chip) - domain->name = irq_data->chip->name; + irq_data->domain = NULL; + irq_data->hwirq = 0; + mutex_unlock(&irq_domain_mutex); + return ret; } - if (hwirq < domain->revmap_size) { - domain->linear_revmap[hwirq] = virq; - } else { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; + } - irq_clear_status_flags(virq, IRQ_NOREQUEST); + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); } + mutex_unlock(&irq_domain_mutex); + + irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } +EXPORT_SYMBOL_GPL(irq_domain_associate); + +void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + int i; + + pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, + of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + + for (i = 0; i 
< count; i++) { + irq_domain_associate(domain, irq_base + i, hwirq_base + i); + } +} EXPORT_SYMBOL_GPL(irq_domain_associate_many); /** @@ -460,12 +462,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, if (unlikely(ret < 0)) return ret; - ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); - if (unlikely(ret < 0)) { - irq_free_descs(irq_base, count); - return ret; - } - + irq_domain_associate_many(domain, irq_base, hwirq_base, count); return 0; } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); @@ -535,7 +532,7 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - irq_domain_disassociate_many(domain, virq, 1); + irq_domain_disassociate(domain, virq); irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit v1.2.3 From 56a3d5ac774d054ece9373277a861338a468a294 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 10 Jun 2013 01:09:33 +0100 Subject: irqdomain: remove irq_domain_generate_simple() Nobody calls it; remove the function Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 80e92492c77b..e47b35671384 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -741,18 +741,3 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#ifdef CONFIG_OF_IRQ -void irq_domain_generate_simple(const struct of_device_id *match, - u64 phys_base, unsigned int irq_start) -{ - struct device_node *node; - pr_debug("looking for phys_base=%llx, irq_start=%i\n", - (unsigned long long) phys_base, (int) irq_start); - node = of_find_matching_node_by_address(NULL, match, phys_base); - if (node) - irq_domain_add_legacy(node, 32, irq_start, 0, - &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif -- cgit v1.2.3 From 
d3dcb436f61593843af178d4a520c8c43c04d3fc Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 10 Jun 2013 12:19:17 +0100 Subject: irqdomain: make irq_linear_revmap() a fast path again Over the years, irq_linear_revmap() gained tests and checks to make sure callers were using it safely, which while important, also make it less of a fast path. After the irqdomain refactoring done recently, it is now possible to make irq_linear_revmap() a fast path again. This patch moves irq_linear_revmap() to the header file and makes it a static inline so that interrupt controller drivers using a linear mapping can decode the virq from a hwirq in just a couple of instructions. Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e47b35671384..836a0f7ec2a9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -559,35 +559,17 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return hwirq; } - return irq_linear_revmap(domain, hwirq); + /* Check if the hwirq is in the linear revmap. */ + if (hwirq < domain->revmap_size) + return domain->linear_revmap[hwirq]; + + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); -/** - * irq_linear_revmap() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space - * - * This is a fast path that can be called directly by irq controller code to - * save a handful of instructions. 
- */ -unsigned int irq_linear_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - struct irq_data *data; - - /* Check revmap bounds; complain if exceeded */ - if (hwirq >= domain->revmap_size) { - rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_tree, hwirq); - rcu_read_unlock(); - return data ? data->irq : 0; - } - - return domain->linear_revmap[hwirq]; -} -EXPORT_SYMBOL_GPL(irq_linear_revmap); - #ifdef CONFIG_IRQ_DOMAIN_DEBUG static int virq_debug_show(struct seq_file *m, void *private) { -- cgit v1.2.3 From c12d2f42a96d72cffa4d9335ca455a2243333c79 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 16:29:19 -0700 Subject: irqdomain: Include hwirq number in /proc/interrupts Add the hardware interrupt number to the output of /proc/interrupts. It is often important to have access to the hardware interrupt number because it identifies exactly how an interrupt signal is wired up to the interrupt controller. This is especially important when using irq_domains since irq numbers get dynamically allocated in that case, and have no relation to the actual hardware number. Note: This output is currently conditional on whether or not the irq_domain pointer is set; however hwirq could still be used without irq_domain. It may be worthwhile to always output the hwirq number regardless of the domain pointer. Signed-off-by: Grant Likely Tested-by: Olof Johansson Cc: Ben Herrenschmidt Cc: Thomas Gleixner --- kernel/irq/proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 19ed5c425c3b..36f6ee181b0c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v) } else { seq_printf(p, " %8s", "None"); } + if (desc->irq_data.domain) + seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); #endif -- cgit v1.2.3 From 798f0fd188be3656991c8745104b5ee045769a5f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 6 Jun 2013 19:20:27 +0800 Subject: irq: fix checkpatch error ERROR: space required before the open parenthesis '(' WARNING: Prefer pr_warn(... to pr_warning(... Just fix above 2 issue. Signed-off-by: Kefeng Wang Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 836a0f7ec2a9..13f265430c25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -396,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { - pr_warning("irq_create_mapping called for" - " NULL domain, hwirq=%lx\n", hwirq); - WARN_ON(1); + WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } pr_debug("-> using domain @%p\n", domain); @@ -489,8 +487,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, if (intsize > 0) return intspec[0]; #endif - pr_warning("no irq domain found for %s !\n", - of_node_full_name(controller)); + pr_warn("no irq domain found for %s !\n", + of_node_full_name(controller)); return 0; } -- cgit v1.2.3 From 6fff8314046276331314ae32cea34c6d11c440d2 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 18 Jun 2013 15:08:33 +0100 Subject: genirq: Irqchip: document gcflags arg of irq_alloc_domain_generic_chips Commit 088f40b7b027dad6519712ff224a5798dd62a204 ("genirq: Generic chip: Add linear irq domain support") missed kerneldoc for the gcflags argument of irq_alloc_domain_generic_chips(). Add it now. 
Signed-off-by: James Hogan Acked-by: Grant Likely Link: http://lkml.kernel.org/r/1371564513-4327-1-git-send-email-james.hogan@imgtec.com Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 95575d8d5392..a746a8f54dae 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -254,6 +254,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) * @handler: Default flow handler associated with these chips * @clr: IRQ_* bits to clear in the mapping function * @set: IRQ_* bits to set in the mapping function + * @gcflags: Generic chip specific setup flags */ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, -- cgit v1.2.3 From 70e5975d3a04be5479a28eec4a2fb10f98ad2785 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 13 Jun 2013 11:39:50 -0700 Subject: clockevents: Prefer CPU local devices over global devices On an SMP system with only one global clockevent and a dummy clockevent per CPU we run into problems. We want the dummy clockevents to be registered as the per CPU tick devices, but we can only achieve that if we register the dummy clockevents before the global clockevent or if we artificially inflate the rating of the dummy clockevents to be higher than the rating of the global clockevent. Failure to do so leads to boot hangs when the dummy timers are registered on all other CPUs besides the CPU that accepted the global clockevent as its tick device and there is no broadcast timer to poke the dummy devices. If we're registering multiple clockevents and one clockevent is global and the other is local to a particular CPU we should choose to use the local clockevent regardless of the rating of the device. 
This way, if the clockevent is a dummy it will take the tick device duty as long as there isn't a higher rated tick device and any global clockevent will be bumped out into broadcast mode, fixing the problem described above. Reported-and-tested-by: Mark Rutland Signed-off-by: Stephen Boyd Tested-by: soren.brinkmann@xilinx.com Cc: John Stultz Cc: Daniel Lezcano Cc: linux-arm-kernel@lists.infradead.org Cc: John Stultz Link: http://lkml.kernel.org/r/20130613183950.GA32061@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5edfb4806032..edd45f64162f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -243,8 +243,13 @@ static bool tick_check_preferred(struct clock_event_device *curdev, return false; } - /* Use the higher rated one */ - return !curdev || newdev->rating > curdev->rating; + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* -- cgit v1.2.3 From 9871bf9550d25e488cd2f0ce958d3f59f17fa720 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jun 2013 15:21:47 -0700 Subject: cgroup: prefix global variables with "cgroup_" Global variable names in kernel/cgroup.c are asking for trouble - subsys, roots, rootnode and so on. Rename them to have "cgroup_" prefix. * s/subsys/cgroup_subsys/ * s/rootnode/cgroup_dummy_root/ * s/dummytop/cgroup_dummy_top/ * s/roots/cgroup_roots/ * s/root_count/cgroup_root_count/ This patch is purely cosmetic.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 153 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 77 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1051c1f69674..8f296b83b6a3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -96,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex); */ #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { +static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { #include }; /* - * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the - * subsystems that are otherwise unattached - it never has more than a - * single cgroup, and all tasks are part of that cgroup. + * The dummy hierarchy, reserved for the subsystems that are otherwise + * unattached - it never has more than a single cgroup, and all tasks are + * part of that cgroup. */ -static struct cgroupfs_root rootnode; +static struct cgroupfs_root cgroup_dummy_root; + +/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ +static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; /* * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. @@ -183,8 +186,8 @@ struct cgroup_event { /* The list of hierarchy roots */ -static LIST_HEAD(roots); -static int root_count; +static LIST_HEAD(cgroup_roots); +static int cgroup_root_count; /* * Hierarchy ID allocation and mapping. 
It follows the same exclusion @@ -193,9 +196,6 @@ static int root_count; */ static DEFINE_IDR(cgroup_hierarchy_idr); -/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ -#define dummytop (&rootnode.top_cgroup) - static struct cgroup_name root_cgroup_name = { .name = "/" }; /* @@ -268,7 +268,7 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) /* for_each_active_root() allows you to iterate across the active hierarchies */ #define for_each_active_root(_root) \ -list_for_each_entry(_root, &roots, root_list) +list_for_each_entry(_root, &cgroup_roots, root_list) static inline struct cgroup *__d_cgrp(struct dentry *dentry) { @@ -650,7 +650,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ - if (allocate_cgrp_cset_links(root_count, &tmp_links) < 0) { + if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } @@ -1000,7 +1000,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; if (!(bit & added_mask)) continue; /* @@ -1009,7 +1009,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, * ensure that subsystems won't disappear once selected. 
*/ BUG_ON(ss == NULL); - if (ss->root != &rootnode) { + if (ss->root != &cgroup_dummy_root) { /* Subsystem isn't free */ return -EBUSY; } @@ -1024,15 +1024,15 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* Process each subsystem */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; unsigned long bit = 1UL << i; if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); - BUG_ON(!dummytop->subsys[i]); - BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - cgrp->subsys[i] = dummytop->subsys[i]; + BUG_ON(!cgroup_dummy_top->subsys[i]); + BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; @@ -1042,14 +1042,14 @@ static int rebind_subsystems(struct cgroupfs_root *root, } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); + BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); if (ss->bind) - ss->bind(dummytop); - dummytop->subsys[i]->cgroup = dummytop; + ss->bind(cgroup_dummy_top); + cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; - subsys[i]->root = &rootnode; - list_move(&ss->sibling, &rootnode.subsys_list); + cgroup_subsys[i]->root = &cgroup_dummy_root; + list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); } else if (bit & final_subsys_mask) { @@ -1112,10 +1112,10 @@ struct cgroup_sb_opts { }; /* - * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call - * with cgroup_mutex held to protect the subsys[] array. 
This function takes - * refcounts on subsystems to be used, unless it returns error, in which case - * no refcounts are taken. + * Convert a hierarchy specifier into a bitmask of subsystems and + * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] + * array. This function takes refcounts on subsystems to be used, unless it + * returns error, in which case no refcounts are taken. */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { @@ -1201,7 +1201,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) } for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; if (ss == NULL) continue; if (strcmp(token, ss->name)) @@ -1228,7 +1228,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; if (ss == NULL) continue; if (ss->disabled) @@ -1284,7 +1284,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!(bit & opts->subsys_mask)) continue; - if (!try_module_get(subsys[i]->module)) { + if (!try_module_get(cgroup_subsys[i]->module)) { module_pin_failed = true; break; } @@ -1301,7 +1301,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!(bit & opts->subsys_mask)) continue; - module_put(subsys[i]->module); + module_put(cgroup_subsys[i]->module); } return -ENOENT; } @@ -1317,7 +1317,7 @@ static void drop_parsed_module_refcounts(unsigned long subsys_mask) if (!(bit & subsys_mask)) continue; - module_put(subsys[i]->module); + module_put(cgroup_subsys[i]->module); } } @@ -1648,8 +1648,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* EBUSY should be the only error here */ BUG_ON(ret); - list_add(&root->root_list, &roots); - root_count++; 
+ list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; sb->s_root->d_fsdata = root_cgrp; root->top_cgroup.dentry = sb->s_root; @@ -1746,7 +1746,7 @@ static void cgroup_kill_sb(struct super_block *sb) { if (!list_empty(&root->root_list)) { list_del(&root->root_list); - root_count--; + cgroup_root_count--; } cgroup_exit_root_id(root); @@ -2807,7 +2807,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, u64 update_before; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &rootnode || + if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); return; @@ -4186,7 +4186,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->cgroup = cgrp; css->flags = 0; css->id = NULL; - if (cgrp == dummytop) + if (cgrp == cgroup_dummy_top) css->flags |= CSS_ROOT; BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; @@ -4615,12 +4615,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &rootnode.subsys_list); - ss->root = &rootnode; - css = ss->css_alloc(dummytop); + list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); + ss->root = &cgroup_dummy_root; + css = ss->css_alloc(cgroup_dummy_top); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, dummytop); + init_cgroup_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4635,7 +4635,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. 
*/ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, dummytop)); + BUG_ON(online_css(ss, cgroup_dummy_top)); mutex_unlock(&cgroup_mutex); @@ -4681,7 +4681,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) */ if (ss->module == NULL) { /* a sanity check */ - BUG_ON(subsys[ss->subsys_id] != ss); + BUG_ON(cgroup_subsys[ss->subsys_id] != ss); return 0; } @@ -4689,26 +4689,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); mutex_lock(&cgroup_mutex); - subsys[ss->subsys_id] = ss; + cgroup_subsys[ss->subsys_id] = ss; /* * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the rootnode + * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(dummytop); + css = ss->css_alloc(cgroup_dummy_top); if (IS_ERR(css)) { - /* failure case - need to deassign the subsys[] slot. */ - subsys[ss->subsys_id] = NULL; + /* failure case - need to deassign the cgroup_subsys[] slot. */ + cgroup_subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } - list_add(&ss->sibling, &rootnode.subsys_list); - ss->root = &rootnode; + list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); + ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, dummytop); + init_cgroup_css(css, ss, cgroup_dummy_top); /* init_idr must be after init_cgroup_css because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); @@ -4739,7 +4739,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(ss, dummytop); + ret = online_css(ss, cgroup_dummy_top); if (ret) goto err_unload; @@ -4774,27 +4774,28 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * try_module_get in parse_cgroupfs_options should ensure that it * doesn't start being used while we're killing it off. 
*/ - BUG_ON(ss->root != &rootnode); + BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); - offline_css(ss, dummytop); + offline_css(ss, cgroup_dummy_top); if (ss->use_id) idr_destroy(&ss->idr); /* deassign the subsys_id */ - subsys[ss->subsys_id] = NULL; + cgroup_subsys[ss->subsys_id] = NULL; - /* remove subsystem from rootnode's list of subsystems */ + /* remove subsystem from the dummy root's list of subsystems */ list_del_init(&ss->sibling); /* - * disentangle the css from all css_sets attached to the dummytop. as - * in loading, we need to pay our respects to the hashtable gods. + * disentangle the css from all css_sets attached to the dummy + * top. as in loading, we need to pay our respects to the hashtable + * gods. */ write_lock(&css_set_lock); - list_for_each_entry(link, &dummytop->cset_links, cset_link) { + list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { struct css_set *cset = link->cset; unsigned long key; @@ -4806,13 +4807,13 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); /* - * remove subsystem's css from the dummytop and free it - need to - * free before marking as null because ss->css_free needs the - * cgrp->subsys pointer to find their state. note that this also - * takes care of freeing the css_id. + * remove subsystem's css from the cgroup_dummy_top and free it - + * need to free before marking as null because ss->css_free needs + * the cgrp->subsys pointer to find their state. note that this + * also takes care of freeing the css_id. 
*/ - ss->css_free(dummytop); - dummytop->subsys[ss->subsys_id] = NULL; + ss->css_free(cgroup_dummy_top); + cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); } @@ -4832,17 +4833,17 @@ int __init cgroup_init_early(void) INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; - init_cgroup_root(&rootnode); - root_count = 1; + init_cgroup_root(&cgroup_dummy_root); + cgroup_root_count = 1; init_task.cgroups = &init_css_set; init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = dummytop; - list_add(&init_cgrp_cset_link.cset_link, &rootnode.top_cgroup.cset_links); + init_cgrp_cset_link.cgrp = cgroup_dummy_top; + list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; /* at bootup time, we don't worry about modular subsystems */ if (!ss || ss->module) @@ -4881,7 +4882,7 @@ int __init cgroup_init(void) return err; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; /* at bootup time, we don't worry about modular subsystems */ if (!ss || ss->module) @@ -4900,7 +4901,7 @@ int __init cgroup_init(void) mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); - BUG_ON(cgroup_init_root_id(&rootnode)); + BUG_ON(cgroup_init_root_id(&cgroup_dummy_root)); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -5004,7 +5005,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) */ mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; if (ss == NULL) continue; seq_printf(m, "%s\t%d\t%d\t%d\n", @@ -5101,7 +5102,7 @@ void cgroup_post_fork(struct task_struct *child) * can't touch that. 
*/ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; if (ss->fork) ss->fork(child); @@ -5172,7 +5173,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) * subsystems, see cgroup_post_fork() for details. */ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; if (ss->exit) { struct cgroup *old_cgrp = @@ -5291,7 +5292,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys *ss = cgroup_subsys[i]; /* * cgroup_disable, being at boot time, can't -- cgit v1.2.3 From a8a648c4acee2095262f7fa65b0d8a68a03c32e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jun 2013 15:21:47 -0700 Subject: cgroup: remove cgroup->actual_subsys_mask cgroup curiously has two subsystem masks, ->subsys_mask and ->actual_subsys_mask. The latter only exists because the new target subsys_mask is passed into rebind_subsystems() via @root->subsys_mask. rebind_subsystems() needs to know what the current mask is to decide how to reach the target mask so ->actual_subsys_mask is used as the temp location to remember the current state. Adding a temporary field to a permanent data structure is rather silly and can be misleading. Update rebind_subsystems() to take @added_mask and @removed_mask instead and remove @root->actual_subsys_mask. This patch shouldn't introduce any behavior changes. v2: Comment and description updated as suggested by Li.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8f296b83b6a3..67fc953c816a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -986,17 +986,14 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_subsys_mask) + unsigned long added_mask, unsigned removed_mask) { - unsigned long added_mask, removed_mask; struct cgroup *cgrp = &root->top_cgroup; int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - removed_mask = root->actual_subsys_mask & ~final_subsys_mask; - added_mask = final_subsys_mask & ~root->actual_subsys_mask; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; @@ -1032,27 +1029,33 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!cgroup_dummy_top->subsys[i]); BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); + /* refcount was already taken, and we're keeping it */ + root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + if (ss->bind) ss->bind(cgroup_dummy_top); cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); + /* subsystem is now free - drop reference on module */ module_put(ss->module); - } else if (bit & 
final_subsys_mask) { + root->subsys_mask &= ~bit; + } else if (bit & root->subsys_mask) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); @@ -1069,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); } } - root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; return 0; } @@ -1343,7 +1345,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); @@ -1365,7 +1367,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) */ cgroup_clear_directory(cgrp->dentry, false, removed_mask); - ret = rebind_subsystems(root, opts.subsys_mask); + ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { /* rebind_subsystems failed, re-populate the removed files */ cgroup_populate_dir(cgrp, false, removed_mask); @@ -1634,7 +1636,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; - ret = rebind_subsystems(root, root->subsys_mask); + ret = rebind_subsystems(root, root->subsys_mask, 0); if (ret == -EBUSY) { free_cgrp_cset_links(&tmp_links); goto unlock_drop; @@ -1727,7 +1729,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0); + ret = rebind_subsystems(root, 0, root->subsys_mask); /* Shouldn't be able to fail ... 
*/ BUG_ON(ret); -- cgit v1.2.3 From b326f9d0dbd066b0aafbe88e6011a680a36de6e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jun 2013 15:21:48 -0700 Subject: cgroup: clean up find_css_set() and friends find_css_set() passes uninitialized on-stack template[] array to find_existing_css_set() which sets the entries for all subsystems. Passing around an uninitialized array is a bit icky and we want to introduce an iterator which only iterates loaded subsystems. Let's initialize it on definition. While at it, also make the following cosmetic cleanups. * Convert to proper /** comments. * Reorder variable declarations. * Replace comment on synchronization with lockdep_assert_held(). This patch doesn't make any functional differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 67fc953c816a..c8d3175c429c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -434,7 +434,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) __put_css_set(cset, 1); } -/* +/** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task @@ -506,27 +506,20 @@ static bool compare_css_sets(struct css_set *cset, return true; } -/* - * find_existing_css_set() is a helper for - * find_css_set(), and checks to see whether an existing - * css_set is suitable. 
- * - * oldcg: the cgroup group that we're using before the cgroup - * transition - * - * cgrp: the cgroup that we're moving into - * - * template: location in which to build the desired set of subsystem - * state objects for the new cgroup group +/** + * find_existing_css_set - init css array and find the matching css_set + * @old_cset: the css_set that we're using before the cgroup transition + * @cgrp: the cgroup that we're moving into + * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { - int i; struct cgroupfs_root *root = cgrp->root; struct css_set *cset; unsigned long key; + int i; /* * Build the set of subsystem state objects that we want to see in the @@ -618,22 +611,25 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, list_add_tail(&link->cgrp_link, &cset->cgrp_links); } -/* - * find_css_set() takes an existing cgroup group and a - * cgroup object, and returns a css_set object that's - * equivalent to the old group, but with the given cgroup - * substituted into the appropriate hierarchy. Must be called with - * cgroup_mutex held +/** + * find_css_set - return a new css_set with one cgroup updated + * @old_cset: the baseline css_set + * @cgrp: the cgroup to be updated + * + * Return a new css_set that's equivalent to @old_cset, but with @cgrp + * substituted into the appropriate hierarchy. 
*/ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; struct list_head tmp_links; struct cgrp_cset_link *link; unsigned long key; + lockdep_assert_held(&cgroup_mutex); + /* First see if we already have a cgroup group that matches * the desired set */ read_lock(&css_set_lock); -- cgit v1.2.3 From 5549c497913ad860d3dff4386c6423268bb85693 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jun 2013 15:21:48 -0700 Subject: cgroup: s/for_each_subsys()/for_each_root_subsys()/ for_each_subsys() walks over subsystems attached to a hierarchy and we're gonna add iterators which walk over all available subsystems. Rename for_each_subsys() to for_each_root_subsys() so that it's more appropriately named and for_each_subsys() can be used to iterate all subsystems. While at it, remove unnecessary underbar prefix from macro arguments, put them inside parentheses, and adjust indentation for the two for_each_*() macros. This patch is purely cosmetic. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c8d3175c429c..605cb13a1574 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -259,16 +259,13 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -/* - * for_each_subsys() allows you to iterate on each subsystem attached to - * an active hierarchy - */ -#define for_each_subsys(_root, _ss) \ -list_for_each_entry(_ss, &_root->subsys_list, sibling) +/* iterate each subsystem attached to a hierarchy */ +#define for_each_root_subsys(root, ss) \ + list_for_each_entry((ss), &(root)->subsys_list, sibling) -/* for_each_active_root() allows you to iterate across the active hierarchies */ -#define for_each_active_root(_root) \ -list_for_each_entry(_root, &cgroup_roots, root_list) +/* iterate across the active hierarchies */ +#define for_each_active_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) static inline struct cgroup *__d_cgrp(struct dentry *dentry) { @@ -828,7 +825,7 @@ static void cgroup_free_fn(struct work_struct *work) /* * Release the subsystem state objects. 
*/ - for_each_subsys(cgrp->root, ss) + for_each_root_subsys(cgrp->root, ss) ss->css_free(cgrp); cgrp->root->number_of_cgroups--; @@ -944,7 +941,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, struct cgroup *cgrp = __d_cgrp(dir); struct cgroup_subsys *ss; - for_each_subsys(cgrp->root, ss) { + for_each_root_subsys(cgrp->root, ss) { struct cftype_set *set; if (!test_bit(ss->subsys_id, &subsys_mask)) continue; @@ -1078,7 +1075,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; mutex_lock(&cgroup_root_mutex); - for_each_subsys(root, ss) + for_each_root_subsys(root, ss) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -2054,7 +2051,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 1: check that we can legitimately attach to the cgroup. */ - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(cgrp, &tset); if (retval) { @@ -2091,7 +2088,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 4: do subsystem attach callbacks. 
*/ - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { if (ss->attach) ss->attach(cgrp, &tset); } @@ -2111,7 +2108,7 @@ out_put_css_set_refs: } out_cancel_attach: if (retval) { - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { if (ss == failed_ss) break; if (ss->cancel_attach) @@ -4137,7 +4134,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, } /* process cftsets of each subsystem */ - for_each_subsys(cgrp->root, ss) { + for_each_root_subsys(cgrp->root, ss) { struct cftype_set *set; if (!test_bit(ss->subsys_id, &subsys_mask)) continue; @@ -4147,7 +4144,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, } /* This cgroup is ready now */ - for_each_subsys(cgrp->root, ss) { + for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; /* * Update id->css pointer and make this css visible from @@ -4294,7 +4291,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; css = ss->css_alloc(cgrp); @@ -4333,14 +4330,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, root->number_of_cgroups++; /* each css holds a ref to the cgroup's dentry */ - for_each_subsys(root, ss) + for_each_root_subsys(root, ss) dget(dentry); /* hold a ref to the parent's dentry */ dget(parent->dentry); /* creation succeeded, notify subsystems */ - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { err = online_css(ss, cgrp); if (err) goto err_destroy; @@ -4365,7 +4362,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; err_free_all: - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; if (css) { @@ -4478,7 +4475,7 @@ 
static int cgroup_destroy_locked(struct cgroup *cgrp) * be killed. */ atomic_set(&cgrp->css_kill_cnt, 1); - for_each_subsys(cgrp->root, ss) { + for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; /* @@ -4552,7 +4549,7 @@ static void cgroup_offline_fn(struct work_struct *work) * css_tryget() is guaranteed to fail now. Tell subsystems to * initate destruction. */ - for_each_subsys(cgrp->root, ss) + for_each_root_subsys(cgrp->root, ss) offline_css(ss, cgrp); /* @@ -4562,7 +4559,7 @@ static void cgroup_offline_fn(struct work_struct *work) * whenever that may be, the extra dentry ref is put so that dentry * destruction happens only after all css's are released. */ - for_each_subsys(cgrp->root, ss) + for_each_root_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); /* delete this cgroup from parent->children */ @@ -4967,7 +4964,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) int count = 0; seq_printf(m, "%d:", root->hierarchy_id); - for_each_subsys(root, ss) + for_each_root_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? 
"," : "", -- cgit v1.2.3 From fbab62c5cd57a6acd9ed80903532c86897d2d560 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 14 Jun 2013 18:40:49 +0200 Subject: irqdomain: Use irq_get_trigger_type() to get IRQ flags Use irq_get_trigger_type() to get the IRQ trigger type flags instead calling irqd_get_trigger_type(irq_desc_get_irq_data(virq)) Signed-off-by: Javier Martinez Canillas Acked-by: Grant Likely Cc: Linus Walleij Cc: Samuel Ortiz Cc: Jason Cooper Cc: Andrew Lunn Cc: Russell King Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/1371228049-27080-8-git-send-email-javier.martinez@collabora.co.uk Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1db9e70f5488..489921e6242a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -687,7 +687,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && - type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + type != irq_get_trigger_type(virq)) irq_set_irq_type(virq, type); return virq; } -- cgit v1.2.3 From 82fe9b0da0d50e2795a49c268676fd132cbc3eea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 11:53:37 -0700 Subject: cgroup: move init_css_set initialization inside cgroup_mutex cgroup_init() was doing init_css_set initialization outside cgroup_mutex, which is fine but we want to add lockdep annotation on subsystem iterations and cgroup_init() will trigger it spuriously. Move init_css_set initialization inside cgroup_mutex. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 605cb13a1574..3409698bd9fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4888,14 +4888,14 @@ int __init cgroup_init(void) cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); - /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); + /* Add init_css_set to the hash table */ + key = css_set_hash(init_css_set.subsys); + hash_add(css_set_table, &init_css_set.hlist, key); + BUG_ON(cgroup_init_root_id(&cgroup_dummy_root)); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From 30159ec7a9db7f3c91e2b27e66389c49302efd5c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 11:53:37 -0700 Subject: cgroup: implement for_each_[builtin_]subsys() There are quite a few places where all loaded [builtin] subsys are iterated. Implement for_each_[builtin_]subsys() and replace manual iterations with those to simplify those places a bit. The new iterators automatically skip NULL subsystems. This shouldn't cause any functional difference. Iteration loops which scan all subsystems and then skip modular ones explicitly are converted to use for_each_builtin_subsys(). While at it, reorder variable declarations and adjust whitespace a bit in the affected functions. v2: Add lockdep_assert_held() in for_each_subsys() and add comments about synchronization as suggested by Li. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 147 +++++++++++++++++++++++++++----------------------------- 1 file changed, 71 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3409698bd9fd..cef688128fb8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -259,6 +259,31 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +/** + * for_each_subsys - iterate all loaded cgroup subsystems + * @ss: the iteration cursor + * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * + * Should be called under cgroup_mutex. + */ +#define for_each_subsys(ss, i) \ + for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + !((ss) = cgroup_subsys[i]); })) { } \ + else + +/** + * for_each_builtin_subsys - iterate all built-in cgroup subsystems + * @ss: the iteration cursor + * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end + * + * Bulit-in subsystems are always present and iteration itself doesn't + * require any synchronization. 
+ */ +#define for_each_builtin_subsys(ss, i) \ + for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[i]) || true); (i)++) + /* iterate each subsystem attached to a hierarchy */ #define for_each_root_subsys(root, ss) \ list_for_each_entry((ss), &(root)->subsys_list, sibling) @@ -356,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { - int i; unsigned long key = 0UL; + struct cgroup_subsys *ss; + int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) + for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; @@ -514,6 +540,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup_subsys_state *template[]) { struct cgroupfs_root *root = cgrp->root; + struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; @@ -523,7 +550,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * new css_set. while subsystems can change globally, the entries here * won't change, so no need for locking. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new @@ -982,23 +1009,19 @@ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { struct cgroup *cgrp = &root->top_cgroup; + struct cgroup_subsys *ss; int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + for_each_subsys(ss, i) { unsigned long bit = 1UL << i; - struct cgroup_subsys *ss = cgroup_subsys[i]; + if (!(bit & added_mask)) continue; - /* - * Nobody should tell us to do a subsys that doesn't exist: - * parse_cgroupfs_options should catch that case and refcounts - * ensure that subsystems won't disappear once selected. - */ - BUG_ON(ss == NULL); + if (ss->root != &cgroup_dummy_root) { /* Subsystem isn't free */ return -EBUSY; @@ -1013,12 +1036,11 @@ static int rebind_subsystems(struct cgroupfs_root *root, return -EBUSY; /* Process each subsystem */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; + for_each_subsys(ss, i) { unsigned long bit = 1UL << i; + if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); BUG_ON(!cgroup_dummy_top->subsys[i]); BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); @@ -1034,7 +1056,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); @@ -1050,7 +1071,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->subsys_mask &= ~bit; } else if (bit & root->subsys_mask) { /* Subsystem state should already exist */ - BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); /* * a refcount was taken, but we already had one, so @@ -1117,8 +1137,9 @@ static int 
parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) char *token, *o = data; bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; - int i; bool module_pin_failed = false; + struct cgroup_subsys *ss; + int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1195,10 +1216,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - if (ss == NULL) - continue; + for_each_subsys(ss, i) { if (strcmp(token, ss->name)) continue; if (ss->disabled) @@ -1221,16 +1239,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * otherwise if 'none', 'name=' and a subsystem name options * were not specified, let's default to 'all' */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - if (ss == NULL) - continue; - if (ss->disabled) - continue; - set_bit(i, &opts->subsys_mask); - } - } + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + set_bit(i, &opts->subsys_mask); /* Consistency checks */ @@ -1274,10 +1286,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * take duplicate reference counts on a subsystem that's already used, * but rebind_subsystems handles this case. 
*/ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) + for_each_subsys(ss, i) { + if (!(opts->subsys_mask & (1UL << i))) continue; if (!try_module_get(cgroup_subsys[i]->module)) { module_pin_failed = true; @@ -1306,11 +1316,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) static void drop_parsed_module_refcounts(unsigned long subsys_mask) { + struct cgroup_subsys *ss; int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - if (!(bit & subsys_mask)) + for_each_subsys(ss, i) { + if (!(subsys_mask & (1UL << i))) continue; module_put(cgroup_subsys[i]->module); } @@ -4822,7 +4832,9 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys); */ int __init cgroup_init_early(void) { + struct cgroup_subsys *ss; int i; + atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); @@ -4837,13 +4849,8 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - - /* at bootup time, we don't worry about modular subsystems */ - if (!ss || ss->module) - continue; - + /* at bootup time, we don't worry about modular subsystems */ + for_each_builtin_subsys(ss, i) { BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->css_alloc); @@ -4868,20 +4875,15 @@ int __init cgroup_init_early(void) */ int __init cgroup_init(void) { - int err; - int i; + struct cgroup_subsys *ss; unsigned long key; + int i, err; err = bdi_init(&cgroup_backing_dev_info); if (err) return err; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - - /* at bootup time, we don't worry about modular subsystems */ - if (!ss || ss->module) - continue; + 
for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) @@ -4990,6 +4992,7 @@ out: /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { + struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -4999,14 +5002,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) * subsys/hierarchy state. */ mutex_lock(&cgroup_mutex); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - if (ss == NULL) - continue; + + for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); - } + mutex_unlock(&cgroup_mutex); return 0; } @@ -5060,6 +5061,7 @@ void cgroup_fork(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + struct cgroup_subsys *ss; int i; /* @@ -5096,12 +5098,9 @@ void cgroup_post_fork(struct task_struct *child) * of the array can be freed at module unload, so we * can't touch that. */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - + for_each_builtin_subsys(ss, i) if (ss->fork) ss->fork(child); - } } } @@ -5142,6 +5141,7 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { + struct cgroup_subsys *ss; struct css_set *cset; int i; @@ -5167,13 +5167,12 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) * fork/exit callbacks are supported only for builtin * subsystems, see cgroup_post_fork() for details. 
*/ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - + for_each_builtin_subsys(ss, i) { if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cset->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(cgrp, old_cgrp, tsk); } } @@ -5280,23 +5279,19 @@ static void cgroup_release_agent(struct work_struct *work) static int __init cgroup_disable(char *str) { - int i; + struct cgroup_subsys *ss; char *token; + int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = cgroup_subsys[i]; - - /* - * cgroup_disable, being at boot time, can't - * know about module subsystems, so we don't - * worry about them. - */ - if (!ss || ss->module) - continue; + /* + * cgroup_disable, being at boot time, can't know about + * module subsystems, so we don't worry about them. + */ + for_each_builtin_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" -- cgit v1.2.3 From fc76df706123602214da494ba98bccea83e2cfff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 11:53:37 -0700 Subject: cgroup: reserve ID 0 for dummy_root and 1 for unified hierarchy Before 1a57423166 ("cgroup: make hierarchy_id use cyclic idr"), hierarchy IDs were allocated from 0. As the dummy hierarchy was always the one first initialized, it got assigned 0 and all other hierarchies from 1. The patch accidentally changed the minimum useable ID to 2. Let's restore ID 0 for dummy_root and while at it reserve 1 for unified hierarchy. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cef688128fb8..f9c99abc38ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1425,14 +1425,15 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } -static int cgroup_init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) { int id; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&cgroup_root_mutex); - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 2, 0, GFP_KERNEL); + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, + GFP_KERNEL); if (id < 0) return id; @@ -1635,7 +1636,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; - ret = cgroup_init_root_id(root); + /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ + ret = cgroup_init_root_id(root, 2, 0); if (ret) goto unlock_drop; @@ -4898,7 +4900,7 @@ int __init cgroup_init(void) key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_init_root_id(&cgroup_dummy_root)); + BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From 13d60f4b6ab5b702dc8d2ee20999f98a93728aec Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 25 Jun 2013 21:19:31 +0800 Subject: futex: Take hugepages into account when generating futex_key The futex_keys of process shared futexes are generated from the page offset, the mapping host and the mapping index of the futex user space address. This should result in a unique identifier for each futex. However, this is not true when futexes are located in different subpages of a hugepage. 
The reason is that the mapping index for all those futexes evaluates to the index of the base page of the hugetlbfs mapping. So a futex at offset 0 of the hugepage mapping and another one at offset PAGE_SIZE of the same hugepage mapping have identical futex_keys. This happens because the futex code blindly uses page->index. Steps to reproduce the bug: 1. Map a file from hugetlbfs. Initialize pthread_mutex1 at offset 0 and pthread_mutex2 at offset PAGE_SIZE of the hugetlbfs mapping. The mutexes must be initialized as PTHREAD_PROCESS_SHARED because PTHREAD_PROCESS_PRIVATE mutexes are not affected by this issue as their keys solely depend on the user space address. 2. Lock mutex1 and mutex2. 3. Create thread1 and in the thread function lock mutex1, which results in thread1 blocking on the locked mutex1. 4. Create thread2 and in the thread function lock mutex2, which results in thread2 blocking on the locked mutex2. 5. Unlock mutex2. Despite the fact that mutex2 got unlocked, thread2 still blocks on mutex2 because the futex_key points to mutex1. To solve this issue we need to take the normal page index of the page which contains the futex into account, if the futex is in a hugetlbfs mapping. In other words, we calculate the normal page mapping index of the subpage in the hugetlbfs mapping. Mappings which are not based on hugetlbfs are not affected and still use page->index. Thanks to Mel Gorman who provided a patch for adding proper evaluation functions to the hugetlbfs code to avoid exposing hugetlbfs-specific details to the futex code. 
[ tglx: Massaged changelog ] Signed-off-by: Zhang Yi Reviewed-by: Jiang Biao Tested-by: Ma Chenggong Reviewed-by: 'Mel Gorman' Acked-by: 'Darren Hart' Cc: 'Peter Zijlstra' Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/000101ce71a6%24a83c5880%24f8b50980%24@com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c94..49dacfb45745 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -365,7 +366,7 @@ again: } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = page_head->mapping->host; - key->shared.pgoff = page_head->index; + key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); -- cgit v1.2.3 From 88c8004fd3a5fdd2378069de86b90b21110d33a4 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Wed, 1 May 2013 18:35:05 -0700 Subject: futex: Use freezable blocking call Avoid waking up every thread sleeping in a futex_wait call during suspend and resume by calling a freezable blocking call. Previous patches modified the freezer to avoid sending wakeups to threads that are blocked in freezable blocking calls. This call was selected to be converted to a freezable call because it doesn't hold any locks or release any resources when interrupted that might be needed by another freezing task or a kernel driver during suspend, and is a common site where idle userspace tasks are blocked. Signed-off-by: Colin Cross Cc: Rafael J. 
Wysocki Cc: arve@android.com Cc: Tejun Heo Cc: Oleg Nesterov Cc: Darren Hart Cc: Randy Dunlap Cc: Al Viro Link: http://lkml.kernel.org/r/1367458508-9133-8-git-send-email-ccross@android.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 49dacfb45745..c3a1a55a5214 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -62,6 +62,7 @@ #include #include #include +#include #include @@ -1808,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - schedule(); + freezable_schedule(); } __set_current_state(TASK_RUNNING); } -- cgit v1.2.3 From d0667186eb0eab78dcca9f75af6ed03873ca8d9f Mon Sep 17 00:00:00 2001 From: JunweiZhang Date: Wed, 26 Jun 2013 16:40:05 +0800 Subject: kernel: remove unnecessary head file ip_vs.h is not necessary for sysctl_binary.c. prepare for the next patch to avoid compile issue. Signed-off-by: JunweiZhang Signed-off-by: Nicolas Dichtel Reviewed-by: Julian Anastasov Signed-off-by: Simon Horman --- kernel/sysctl_binary.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index aea4a9ea6fc8..b609213ca9a2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -3,7 +3,6 @@ #include "../fs/xfs/xfs_sysctl.h" #include #include -#include #include #include #include -- cgit v1.2.3 From a41b56efa70e060f650aeb54740aaf52044a1ead Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 20 Jun 2013 13:31:05 +0200 Subject: arch: Make __mutex_fastpath_lock_retval return whether fastpath succeeded or not This will allow me to call functions that have multiple arguments if fastpath fails. This is required to support ticket mutexes, because they need to be able to pass an extra argument to the fail function. 
Originally I duplicated the functions, by adding __mutex_fastpath_lock_retval_arg. This ended up being just a duplication of the existing function, so a way to test if fastpath was called ended up being better. This also cleaned up the reservation mutex patch some by being able to call an atomic_set instead of atomic_xchg, and making it easier to detect if the wrong unlock function was previously used. Signed-off-by: Maarten Lankhorst Acked-by: Peter Zijlstra Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: robclark@gmail.com Cc: rostedt@goodmis.org Cc: daniel@ffwll.ch Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20130620113105.4001.83929.stgit@patser Signed-off-by: Ingo Molnar --- kernel/mutex.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index ad53a664f113..42f8dda2467b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -494,10 +494,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count) * mutex_lock_interruptible() and mutex_trylock(). 
*/ static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count); +__mutex_lock_killable_slowpath(struct mutex *lock); static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count); +__mutex_lock_interruptible_slowpath(struct mutex *lock); /** * mutex_lock_interruptible - acquire the mutex, interruptible @@ -515,12 +515,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock) int ret; might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); - if (!ret) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); - - return ret; + return 0; + } else + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); @@ -530,12 +530,12 @@ int __sched mutex_lock_killable(struct mutex *lock) int ret; might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_killable_slowpath); - if (!ret) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); - - return ret; + return 0; + } else + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); @@ -548,18 +548,14 @@ __mutex_lock_slowpath(atomic_t *lock_count) } static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count) +__mutex_lock_killable_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count) +__mutex_lock_interruptible_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } #endif -- cgit v1.2.3 From 040a0a37100563754bb1fee6ff6427420bcfa609 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 24 Jun 2013 10:30:04 +0200 
Subject: mutex: Add support for wound/wait style locks Wound/wait mutexes are used when other multiple lock acquisitions of a similar type can be done in an arbitrary order. The deadlock handling used here is called wait/wound in the RDBMS literature: The older task waits until it can acquire the contended lock. The younger task needs to back off and drop all the locks it is currently holding, i.e. the younger task is wounded. For full documentation please read Documentation/ww-mutex-design.txt. References: https://lwn.net/Articles/548909/ Signed-off-by: Maarten Lankhorst Acked-by: Daniel Vetter Acked-by: Rob Clark Acked-by: Peter Zijlstra Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: rostedt@goodmis.org Cc: daniel@ffwll.ch Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/51C8038C.9000106@canonical.com Signed-off-by: Ingo Molnar --- kernel/mutex.c | 318 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 302 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 42f8dda2467b..fc801aafe8fd 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock) EXPORT_SYMBOL(mutex_unlock); +/** + * ww_mutex_unlock - release the w/w mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously with any of the + * ww_mutex_lock* functions (with or without an acquire context). It is + * forbidden to release the locks after releasing the acquire context. + * + * This function must not be used in interrupt context. Unlocking + * of a unlocked mutex is not allowed. 
+ */ +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } + +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(&lock->base); +#endif + __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +static inline int __sched +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + + if (!hold_ctx) + return 0; + + if (unlikely(ctx == hold_ctx)) + return -EALREADY; + + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! 
+ */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. + * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. 
+ */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} + /* * Lock a mutex (possibly interruptible), slowpath: */ -static inline int __sched +static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + int ret; preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; + if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (ACCESS_ONCE(ww->ctx)) + break; + } + /* * If there's an owner, wait for it to either * release the lock or go to sleep. @@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); + if (!__builtin_constant_p(ww_ctx == NULL)) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + mutex_set_owner(lock); mspin_unlock(MLOCK(lock), &node); preempt_enable(); @@ -371,15 +543,16 @@ slowpath: * TASK_UNINTERRUPTIBLE case.) 
*/ if (unlikely(signal_pending_state(state, task))) { - mutex_remove_waiter(lock, &waiter, - task_thread_info(task)); - mutex_release(&lock->dep_map, 1, ip); - spin_unlock_mutex(&lock->wait_lock, flags); + ret = -EINTR; + goto err; + } - debug_mutex_free_waiter(&waiter); - preempt_enable(); - return -EINTR; + if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + ret = __mutex_lock_check_stamp(lock, ww_ctx); + if (ret) + goto err; } + __set_task_state(task, state); /* didn't get the lock, go to sleep: */ @@ -394,6 +567,30 @@ done: mutex_remove_waiter(lock, &waiter, current_thread_info()); mutex_set_owner(lock); + if (!__builtin_constant_p(ww_ctx == NULL)) { + struct ww_mutex *ww = container_of(lock, + struct ww_mutex, + base); + struct mutex_waiter *cur; + + /* + * This branch gets optimized out for the common case, + * and is only important for ww_mutex_lock. + */ + + ww_mutex_lock_acquired(ww, ww_ctx); + ww->ctx = ww_ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. 
+ */ + list_for_each_entry(cur, &lock->wait_list, list) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + } + /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -404,6 +601,14 @@ done: preempt_enable(); return 0; + +err: + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + spin_unlock_mutex(&lock->wait_lock, flags); + debug_mutex_free_waiter(&waiter); + mutex_release(&lock->dep_map, 1, ip); + preempt_enable(); + return ret; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -411,7 +616,8 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -420,7 +626,8 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + 0, nest, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -429,7 +636,8 @@ int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, + subclass, NULL, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -438,10 +646,30 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_); + subclass, NULL, _RET_IP_, NULL); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + might_sleep(); + return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, + 0, &ctx->dep_map, 
_RET_IP_, ctx); +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + might_sleep(); + return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx); +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); + #endif /* @@ -544,20 +772,39 @@ __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, 0, + NULL, _RET_IP_, NULL); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL); +} + +static noinline int __sched +__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx); } + +static noinline int __sched +__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx); +} + #endif /* @@ -613,6 +860,45 @@ int __sched mutex_trylock(struct mutex *lock) } EXPORT_SYMBOL(mutex_trylock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_slowpath(lock, ctx); + 
return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock_interruptible); + +#endif + /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 * @cnt: the atomic which we are to dec -- cgit v1.2.3 From 230100276955529d5a7c69207421756b9a61a8e5 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 20 Jun 2013 13:31:17 +0200 Subject: mutex: Add w/w mutex slowpath debugging Injects EDEADLK conditions at pseudo-random interval, with exponential backoff up to UINT_MAX (to ensure that every lock operation still completes in a reasonable time). This way we can test the wound slowpath even for ww mutex users where contention is never expected, and the ww deadlock avoidance algorithm is only needed for correctness against malicious userspace. An example would be protecting kernel modesetting properties, which thanks to single-threaded X isn't really expected to contend, ever. I've looked into using the CONFIG_FAULT_INJECTION infrastructure, but decided against it for two reasons: - EDEADLK handling is mandatory for ww mutex users and should never affect the outcome of a syscall. This is in contrast to -ENOMEM injection. So fine configurability isn't required. - The fault injection framework only allows to set a simple probability for failure. Now the probability that a ww mutex acquire stage with N locks will never complete (due to too many injected EDEADLK backoffs) is zero. But the expected number of ww_mutex_lock operations for the completely uncontended case would be O(exp(N)). 
The per-acquire ctx exponential backoff solution chosen here only results in O(log N) overhead due to injection and so O(log N * N) lock operations. This way we can fail with high probability (and so have good test coverage even for fancy backoff and lock acquisition paths) without running into pathological cases. Note that EDEADLK will only ever be injected when we managed to acquire the lock. This prevents any behaviour changes for users which rely on the EALREADY semantics. Signed-off-by: Daniel Vetter Signed-off-by: Maarten Lankhorst Acked-by: Peter Zijlstra Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: rostedt@goodmis.org Cc: daniel@ffwll.ch Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20130620113117.4001.21681.stgit@patser Signed-off-by: Ingo Molnar --- kernel/mutex.c | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index fc801aafe8fd..e581ada5faf4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -651,22 +651,60 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH + unsigned tmp; + + if (ctx->deadlock_inject_countdown-- == 0) { + tmp = ctx->deadlock_inject_interval; + if (tmp > UINT_MAX/4) + tmp = UINT_MAX; + else + tmp = tmp*2 + tmp + tmp/2; + + ctx->deadlock_inject_interval = tmp; + ctx->deadlock_inject_countdown = tmp; + ctx->contending_lock = lock; + + ww_mutex_unlock(lock); + + return -EDEADLK; + } +#endif + + return 0; +} int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { + int ret; + might_sleep(); - return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, + ret = __mutex_lock_common(&lock->base, 
TASK_UNINTERRUPTIBLE, 0, &ctx->dep_map, _RET_IP_, ctx); + if (!ret && ctx->acquired > 0) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; } EXPORT_SYMBOL_GPL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { + int ret; + might_sleep(); - return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx); + + if (!ret && ctx->acquired > 0) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; } EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); -- cgit v1.2.3 From 1672d040709b789671c0502e7aac9d632c2f9175 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 18:04:54 -0700 Subject: cgroup: fix cgroupfs_root early destruction path cgroupfs_root used to have ->actual_subsys_mask in addition to ->subsys_mask. a8a648c4ac ("cgroup: remove cgroup->actual_subsys_mask") removed it noting that the subsys_mask is essentially temporary and doesn't belong in cgroupfs_root; however, the patch made it impossible to tell whether a cgroupfs_root actually has the subsystems bound or just have the bits set leading to the following BUG when trying to mount with subsystems which are already mounted elsewhere. kernel BUG at kernel/cgroup.c:1038! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ... CPU: 1 PID: 7973 Comm: mount Tainted: G W 3.10.0-rc7-next-20130625-sasha-00011-g1c1dc0e #1105 task: ffff880fc0ae8000 ti: ffff880fc0b9a000 task.ti: ffff880fc0b9a000 RIP: 0010:[] [] rebind_subsystems+0x409/0x5f0 ... 
Call Trace: [] cgroup_kill_sb+0xff/0x210 [] deactivate_locked_super+0x4f/0x90 [] cgroup_mount+0x673/0x6e0 [] cpuset_mount+0xd9/0x110 [] mount_fs+0xb0/0x2d0 [] vfs_kern_mount+0xbd/0x180 [] do_new_mount+0x145/0x2c0 [] do_mount+0x356/0x3c0 [] SyS_mount+0xfd/0x140 [] tracesys+0xdd/0xe2 We still want rebind_subsystems() to take added/removed masks, so let's fix it by marking whether a cgroupfs_root has finished binding or not. Also, document what's going on around ->subsys_mask initialization so that similar mistakes aren't repeated. Signed-off-by: Tejun Heo Reported-by: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f9c99abc38ab..e801ecfa36ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1086,6 +1086,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } + /* + * Mark @root has finished binding subsystems. @root->subsys_mask + * now matches the bound subsystems. + */ + root->flags |= CGRP_ROOT_SUBSYS_BOUND; + return 0; } @@ -1485,6 +1491,14 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); + /* + * We need to set @root->subsys_mask now so that @root can be + * matched by cgroup_test_super() before it finishes + * initialization; otherwise, competing mounts with the same + * options may try to bind the same subsystems instead of waiting + * for the first one leading to unexpected mount errors. + * SUBSYS_BOUND will be set once actual binding is complete. + */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; ida_init(&root->cgroup_ida); @@ -1734,9 +1748,11 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... 
*/ - BUG_ON(ret); + if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { + ret = rebind_subsystems(root, 0, root->subsys_mask); + /* Shouldn't be able to fail ... */ + BUG_ON(ret); + } /* * Release all the links from cset_links to this hierarchy's -- cgit v1.2.3 From eb178d063324d9c30f673db3877b892a48ade21e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 18:05:21 -0700 Subject: cgroup: grab cgroup_mutex in drop_parsed_module_refcounts() This isn't strictly necessary as all subsystems specified in @subsys_mask are guaranteed to be pinned; however, it does spuriously trigger lockdep warning. Let's grab cgroup_mutex around it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e801ecfa36ef..2d3a132e881d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1325,11 +1325,11 @@ static void drop_parsed_module_refcounts(unsigned long subsys_mask) struct cgroup_subsys *ss; int i; - for_each_subsys(ss, i) { - if (!(subsys_mask & (1UL << i))) - continue; - module_put(cgroup_subsys[i]->module); - } + mutex_lock(&cgroup_mutex); + for_each_subsys(ss, i) + if (subsys_mask & (1UL << i)) + module_put(cgroup_subsys[i]->module); + mutex_unlock(&cgroup_mutex); } static int cgroup_remount(struct super_block *sb, int *flags, char *data) -- cgit v1.2.3 From a8ad805cfde00be8fe3b3dae8890996dbeb91e2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Jun 2013 15:52:04 -0700 Subject: cgroup: fix RCU accesses around task->cgroups There are several places in kernel/cgroup.c where task->cgroups is accessed and modified without going through proper RCU accessors. None is broken as they're all lock protected accesses; however, this still triggers sparse RCU address space warnings. * Consistently use task_css_set() for task->cgroups dereferencing. * Use RCU_INIT_POINTER() to clear task->cgroups to &init_css_set on exit. 
* Remove unnecessary rcu_dereference_raw() from cset->subsys[] dereference in cgroup_exit(). Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Li Zefan --- kernel/cgroup.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2d3a132e881d..ee9f0c1c8bff 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -724,7 +724,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * task can't change groups, so the only thing that can happen * is that it exits and its css is set back to init_css_set. */ - cset = task->cgroups; + cset = task_css_set(task); if (cset == &init_css_set) { res = &root->top_cgroup; } else { @@ -1971,7 +1971,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); - old_cset = tsk->cgroups; + old_cset = task_css_set(tsk); task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); @@ -2094,8 +2094,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * we use find_css_set, which allocates a new one if necessary. */ for (i = 0; i < group_size; i++) { + struct css_set *old_cset; + tc = flex_array_get(group, i); - tc->cg = find_css_set(tc->task->cgroups, cgrp); + old_cset = task_css_set(tc->task); + tc->cg = find_css_set(old_cset, cgrp); if (!tc->cg) { retval = -ENOMEM; goto out_put_css_set_refs; @@ -3012,7 +3015,7 @@ static void cgroup_enable_task_cg_lists(void) * entry won't be deleted though the process has exited. 
*/ if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &p->cgroups->tasks); + list_add(&p->cg_list, &task_css_set(p)->tasks); task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -5061,8 +5064,8 @@ static const struct file_operations proc_cgroupstats_operations = { void cgroup_fork(struct task_struct *child) { task_lock(current); + get_css_set(task_css_set(current)); child->cgroups = current->cgroups; - get_css_set(child->cgroups); task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } @@ -5097,7 +5100,7 @@ void cgroup_post_fork(struct task_struct *child) write_lock(&css_set_lock); task_lock(child); if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &child->cgroups->tasks); + list_add(&child->cg_list, &task_css_set(child)->tasks); task_unlock(child); write_unlock(&css_set_lock); } @@ -5177,8 +5180,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) /* Reassign the task to the init_css_set. */ task_lock(tsk); - cset = tsk->cgroups; - tsk->cgroups = &init_css_set; + cset = task_css_set(tsk); + RCU_INIT_POINTER(tsk->cgroups, &init_css_set); if (run_callbacks && need_forkexit_callback) { /* @@ -5187,8 +5190,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = - rcu_dereference_raw(cset->subsys[i])->cgroup; + struct cgroup *old_cgrp = cset->subsys[i]->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); ss->exit(cgrp, old_cgrp, tsk); @@ -5555,7 +5557,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, u64 count; rcu_read_lock(); - count = atomic_read(&current->cgroups->refcount); + count = atomic_read(&task_css_set(current)->refcount); rcu_read_unlock(); return count; } -- cgit v1.2.3 From a4ea1cc90604df08d471ae84eb9627319d10c844 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Jun 2013 15:52:33 -0700 Subject: cgroup: always use RCU accessors for protected accesses kernel/cgroup.c still has 
places where an RCU pointer is set and accessed directly without going through RCU_INIT_POINTER() or rcu_dereference_protected(). They're all properly protected accesses so nothing is broken but it leads to spurious sparse RCU address space warnings. Substitute direct accesses with RCU_INIT_POINTER() and rcu_dereference_protected(). Note that %true is specified as the extra condition for all dereference updates. This isn't ideal as all it does is suppressing warning without actually policing synchronization rules; however, most are scheduled to be removed pretty soon along with css_id itself, so no reason to be more elaborate. Combined with the previous changes, this removes all RCU related sparse warnings from cgroup. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ee9f0c1c8bff..4ed86773fff7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1427,7 +1427,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; - cgrp->name = &root_cgroup_name; + RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); } @@ -2558,7 +2558,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return ret; } - old_name = cgrp->name; + old_name = rcu_dereference_protected(cgrp->name, true); rcu_assign_pointer(cgrp->name, name); kfree_rcu(old_name, rcu_head); @@ -4177,13 +4177,15 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct css_id *id = rcu_dereference_protected(css->id, true); + /* * Update id->css pointer and make this css visible from * CSS ID functions. 
This pointer will be dereferened * from RCU-read-side without locks. */ - if (css->id) - rcu_assign_pointer(css->id->css, css); + if (id) + rcu_assign_pointer(id->css, css); } return 0; @@ -4863,7 +4865,7 @@ int __init cgroup_init_early(void) css_set_count = 1; init_cgroup_root(&cgroup_dummy_root); cgroup_root_count = 1; - init_task.cgroups = &init_css_set; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); init_cgrp_cset_link.cset = &init_css_set; init_cgrp_cset_link.cgrp = cgroup_dummy_top; @@ -5380,7 +5382,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { - struct css_id *id = css->id; + struct css_id *id = rcu_dereference_protected(css->id, true); + /* When this is called before css_id initialization, id can be NULL */ if (!id) return; @@ -5446,8 +5449,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return PTR_ERR(newid); newid->stack[0] = newid->id; - newid->css = rootcss; - rootcss->id = newid; + RCU_INIT_POINTER(newid->css, rootcss); + RCU_INIT_POINTER(rootcss->id, newid); return 0; } @@ -5461,7 +5464,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, subsys_id = ss->subsys_id; parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; - parent_id = parent_css->id; + parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); -- cgit v1.2.3 From 141965c7494d984b2bf24efd361a3125278869c6 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 26 Jun 2013 13:05:39 +0800 Subject: Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking" Remove CONFIG_FAIR_GROUP_SCHED that covers the runnable info, then we can use runnable load variables. Also remove 2 CONFIG_FAIR_GROUP_SCHED setting which is not in reverted patch(introduced in 9ee474f), but also need to revert. 
Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51CA76A3.3050207@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +------ kernel/sched/fair.c | 17 ++++------------- kernel/sched/sched.h | 19 ++----------------- 3 files changed, 7 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ceeaf0f45be0..0241b1b55a04 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1611,12 +1611,7 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP p->se.avg.runnable_avg_period = 0; p->se.avg.runnable_avg_sum = 0; #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0ac2c3b56e1..36eadaaa4e5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1128,8 +1128,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. @@ -3430,12 +3429,6 @@ unlock: return new_cpu; } -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). 
- */ -#ifdef CONFIG_FAIR_GROUP_SCHED /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -3459,7 +3452,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } } -#endif #endif /* CONFIG_SMP */ static unsigned long @@ -5861,7 +5853,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP /* * Remove our load from contribution when we leave sched_fair * and ensure we don't carry in an old decay_count if we @@ -5920,7 +5912,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP atomic64_set(&cfs_rq->decay_counter, 1); atomic64_set(&cfs_rq->removed_load, 0); #endif @@ -6162,9 +6154,8 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, -#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, -#endif + .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 029601a61587..77ce668ba302 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -269,12 +269,6 @@ struct cfs_rq { #endif #ifdef CONFIG_SMP -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. 
@@ -284,9 +278,9 @@ struct cfs_rq { u64 runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -/* These always depend on CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; u64 tg_load_contrib; #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1027,17 +1021,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); -/* - * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg - * becomes useful in lb - */ -#if defined(CONFIG_FAIR_GROUP_SCHED) extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else -static inline void idle_enter_fair(struct rq *this_rq) {} -static inline void idle_exit_fair(struct rq *this_rq) {} -#endif #else /* CONFIG_SMP */ -- cgit v1.2.3 From fa6bddeb14d59d701f846b174b59c9982e926e66 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:46 +0800 Subject: sched: Move a few runnable tg variables into CONFIG_SMP The following 2 variables are only used under CONFIG_SMP, so it's better to move their definition into CONFIG_SMP too.
atomic64_t load_avg; atomic_t runnable_avg; Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-3-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 77ce668ba302..31d25f80a7c6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -149,9 +149,11 @@ struct task_group { unsigned long shares; atomic_t load_weight; +#ifdef CONFIG_SMP atomic64_t load_avg; atomic_t runnable_avg; #endif +#endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; -- cgit v1.2.3 From a75cdaa915e42ef0e6f38dc7f2a6a1deca91d648 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:47 +0800 Subject: sched: Set an initial value of runnable avg for new forked task We need to initialize the se.avg.{decay_count, load_avg_contrib} for a new forked task. Otherwise random values of above variables cause a mess when a new task is enqueued: enqueue_task_fair enqueue_entity enqueue_entity_load_avg and make fork balancing imbalance due to incorrect load_avg_contrib. Further more, Morten Rasmussen notice some tasks were not launched at once after created. So Paul and Peter suggest giving a start value for new task runnable avg time same as sched_slice(). PeterZ said: > So the 'problem' is that our running avg is a 'floating' average; ie. it > decays with time. Now we have to guess about the future of our newly > spawned task -- something that is nigh impossible seeing these CPU > vendors keep refusing to implement the crystal ball instruction. > > So there's two asymptotic cases we want to deal well with; 1) the case > where the newly spawned program will be 'nearly' idle for its lifetime; > and 2) the case where its cpu-bound. 
> > Since we have to guess, we'll go for worst case and assume its > cpu-bound; now we don't want to make the avg so heavy adjusting to the > near-idle case takes forever. We want to be able to quickly adjust and > lower our running avg. > > Now we also don't want to make our avg too light, such that it gets > decremented just for the new task not having had a chance to run yet -- > even if when it would run, it would be more cpu-bound than not. > > So what we do is we make the initial avg of the same duration as that we > guess it takes to run each task on the system at least once -- aka > sched_slice(). > > Of course we can defeat this with wakeup/fork bombs, but in the 'normal' > case it should be good enough. Paul also contributed most of the code comments in this commit. Signed-off-by: Alex Shi Reviewed-by: Gu Zheng Reviewed-by: Paul Turner [peterz; added explanation of sched_slice() usage] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-4-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++---- kernel/sched/fair.c | 24 ++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0241b1b55a04..729e7fc7634b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1611,10 +1611,6 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -#ifdef CONFIG_SMP - p->se.avg.runnable_avg_period = 0; - p->se.avg.runnable_avg_sum = 0; -#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -1758,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif + /* Initialize new task's runnable average */ + init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = 1; diff --git a/kernel/sched/fair.c 
b/kernel/sched/fair.c index 36eadaaa4e5b..e1602a0fdbf8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -680,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +#ifdef CONFIG_SMP +static inline void __update_task_entity_contrib(struct sched_entity *se); + +/* Give new task start runnable values to heavy its load in infant time */ +void init_task_runnable_average(struct task_struct *p) +{ + u32 slice; + + p->se.avg.decay_count = 0; + slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; + p->se.avg.runnable_avg_sum = slice; + p->se.avg.runnable_avg_period = slice; + __update_task_entity_contrib(&p->se); +} +#else +void init_task_runnable_average(struct task_struct *p) +{ +} +#endif + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1527,6 +1547,10 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, * We track migrations using entity decay_count <= 0, on a wake-up * migration we use a negative decay count to track the remote decays * accumulated while sleeping. + * + * Newly forked tasks are enqueued with se->avg.decay_count == 0, they + * are seen by enqueue_entity_load_avg() as a migration with an already + * constructed load_avg_contrib. 
*/ if (unlikely(se->avg.decay_count <= 0)) { se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31d25f80a7c6..9c65d46504b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1048,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); +extern void init_task_runnable_average(struct task_struct *p); + #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { -- cgit v1.2.3 From 282cf499f03ec1754b6c8c945c9674b02631fb0f Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:48 +0800 Subject: sched: Fix sleep time double accounting in enqueue entity The woken migrated task will __synchronize_entity_decay(se); in migrate_task_rq_fair, then it needs to set `se->avg.last_runnable_update -= (-se->avg.decay_count) << 20' before update_entity_load_avg, in order to avoid sleep time is updated twice for se.avg.load_avg_contrib in both __synchronize_entity_decay and update_entity_load_avg. However if the sleeping task is woken up from the same cpu, it misses the last_runnable_update before update_entity_load_avg(se, 0, 1), then the sleep time was used twice in both functions. So we need to remove the double sleep time accounting. Paul also contributed some code comments in this commit.
Signed-off-by: Alex Shi Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-5-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e1602a0fdbf8..9bbc303598ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1571,7 +1571,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } wakeup = 0; } else { - __synchronize_entity_decay(se); + /* + * Task re-woke on same cpu (or else migrate_task_rq_fair() + * would have made count negative); we must be careful to avoid + * double-accounting blocked time after synchronizing decays. + */ + se->avg.last_runnable_update += __synchronize_entity_decay(se) + << 20; } /* migrated tasks did not contribute to our blocked load */ -- cgit v1.2.3 From 83dfd5235ebd66c284b97befe6eabff7132333e6 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:49 +0800 Subject: sched: Update cpu load after task_tick To get the latest runnable info, we need do this cpuload update after task_tick. 
Signed-off-by: Alex Shi Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-6-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 729e7fc7634b..08746cc12370 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2165,8 +2165,8 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); + update_cpu_load_active(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); -- cgit v1.2.3 From b92486cbf2aa230d00f160664858495c81d2b37b Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:50 +0800 Subject: sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task They are the base values in load balance, update them with rq runnable load average, then the load balance will consider runnable load avg naturally. We also try to include the blocked_load_avg as cpu load in balancing, but that cause kbuild performance drop 6% on every Intel machine, and aim7/oltp drop on some of 4 CPU sockets machines. Or only add blocked_load_avg into get_rq_runnable_load, hackbench still drop a little on NHM EX.
Signed-off-by: Alex Shi Reviewed-by: Gu Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-7-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- kernel/sched/proc.c | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9bbc303598ea..e6d82cae4910 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2963,7 +2963,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->load.weight; + return cpu_rq(cpu)->cfs.runnable_load_avg; } /* @@ -3008,9 +3008,10 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) - return rq->load.weight / nr_running; + return load_avg / nr_running; return 0; } diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index bb3a6a0b8623..ce5cd4892e43 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -501,6 +501,18 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_SMP +unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->cfs.runnable_load_avg; +} +#else +unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->load.weight; +} +#endif + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -522,7 +534,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; + unsigned long load = get_rq_runnable_load(this_rq); unsigned long 
pending_updates; /* @@ -568,11 +580,12 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { + unsigned long load = get_rq_runnable_load(this_rq); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); + __update_cpu_load(this_rq, load, 1); calc_load_account_active(this_rq); } -- cgit v1.2.3 From a003a25b227d59ded9197ced109517f037d01c27 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:51 +0800 Subject: sched: Consider runnable load average in move_tasks() Aside from using runnable load average in background, move_tasks is also the key function in load balance. We need consider the runnable load average in it in order to make it an apple to apple load comparison. Morten had caught a div u64 bug on ARM, thanks! Thanks-to: Morten Rasmussen Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-8-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6d82cae4910..7948bb825985 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4179,11 +4179,14 @@ static int tg_load_down(struct task_group *tg, void *data) long cpu = (long)data; if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; + load = cpu_rq(cpu)->avg.load_avg_contrib; } else { + unsigned long tmp_rla; + tmp_rla = tg->parent->cfs_rq[cpu]->runnable_load_avg + 1; + load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + load *= tg->se[cpu]->avg.load_avg_contrib; + load /= tmp_rla; } tg->cfs_rq[cpu]->h_load = load; @@ -4209,12 +4212,9 @@ static void update_h_load(long cpu) static unsigned long task_h_load(struct task_struct *p) { struct 
cfs_rq *cfs_rq = task_cfs_rq(p); - unsigned long load; - - load = p->se.load.weight; - load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - return load; + return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, + cfs_rq->runnable_load_avg + 1); } #else static inline void update_blocked_averages(int cpu) @@ -4227,7 +4227,7 @@ static inline void update_h_load(long cpu) static unsigned long task_h_load(struct task_struct *p) { - return p->se.load.weight; + return p->se.avg.load_avg_contrib; } #endif -- cgit v1.2.3 From 72a4cf20cb71a327c636c7042fdacc25abffc87c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:53 +0800 Subject: sched: Change cfs_rq load avg to unsigned long Since the 'u64 runnable_load_avg, blocked_load_avg' in cfs_rq struct are smaller than 'unsigned long' cfs_rq->load.weight. We don't need u64 variables to describe them. unsigned long is more efficient and convenient. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-10-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 4 ++-- kernel/sched/fair.c | 7 ++----- kernel/sched/sched.h | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 75024a673520..160afdc5cdff 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -211,9 +211,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", (unsigned
long long)atomic64_read(&cfs_rq->tg->load_avg)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7948bb825985..f19772de1b1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4181,12 +4181,9 @@ static int tg_load_down(struct task_group *tg, void *data) if (!tg->parent) { load = cpu_rq(cpu)->avg.load_avg_contrib; } else { - unsigned long tmp_rla; - tmp_rla = tg->parent->cfs_rq[cpu]->runnable_load_avg + 1; - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->avg.load_avg_contrib; - load /= tmp_rla; + load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, + tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); } tg->cfs_rq[cpu]->h_load = load; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9c65d46504b1..9eb12d9edd35 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -277,7 +277,7 @@ struct cfs_rq { * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). */ - u64 runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg; atomic64_t decay_counter, removed_load; u64 last_decay; -- cgit v1.2.3 From bf5b986ed4d20428eeec3df4a03dbfebb9b6538c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:54 +0800 Subject: sched/tg: Use 'unsigned long' for load variable in task group Since tg->load_avg is smaller than tg->load_weight, we don't need a atomic64_t variable for load_avg in 32 bit machine. The same reason for cfs_rq->tg_load_contrib. The atomic_long_t/unsigned long variable type are more efficient and convenience for them. 
Signed-off-by: Alex Shi Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-11-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 +++--- kernel/sched/fair.c | 12 ++++++------ kernel/sched/sched.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 160afdc5cdff..d803989defc0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -215,9 +215,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", - (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", cfs_rq->tg_runnable_contrib); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f19772de1b1c..30ccc37112d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1075,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). 
*/ - tg_weight = atomic64_read(&tg->load_avg); + tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; @@ -1356,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) { struct task_group *tg = cfs_rq->tg; - s64 tg_contrib; + long tg_contrib; tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; - if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic64_add(tg_contrib, &tg->load_avg); + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; } } @@ -1397,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) u64 contrib; contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div64_u64(contrib, - atomic64_read(&tg->load_avg) + 1); + se->avg.load_avg_contrib = div_u64(contrib, + atomic_long_read(&tg->load_avg) + 1); /* * For group entities we need to compute a correction term in the case diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9eb12d9edd35..5585eb25e9a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -150,7 +150,7 @@ struct task_group { atomic_t load_weight; #ifdef CONFIG_SMP - atomic64_t load_avg; + atomic_long_t load_avg; atomic_t runnable_avg; #endif #endif @@ -284,7 +284,7 @@ struct cfs_rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; - u64 tg_load_contrib; + unsigned long tg_load_contrib; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* -- cgit v1.2.3 From 2509940fd71c2e2915a05052bbdbf2d478364184 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:55 +0800 Subject: sched/cfs_rq: Change atomic64_t removed_load to atomic_long_t Similar to runnable_load_avg, blocked_load_avg variable, long type is enough for 
removed_load in 64 bit or 32 bit machine. Then we avoid the expensive atomic64 operations on 32 bit machine. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-12-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++---- kernel/sched/sched.h | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 30ccc37112d0..b43474a964c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1517,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) if (!decays && !force_update) return; - if (atomic64_read(&cfs_rq->removed_load)) { - u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + if (atomic_long_read(&cfs_rq->removed_load)) { + unsigned long removed_load; + removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); subtract_blocked_load_contrib(cfs_rq, removed_load); } @@ -3480,7 +3481,8 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) */ if (se->avg.decay_count) { se->avg.decay_count = -__synchronize_entity_decay(se); - atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + atomic_long_add(se->avg.load_avg_contrib, + &cfs_rq->removed_load); } } #endif /* CONFIG_SMP */ @@ -5942,7 +5944,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #endif #ifdef CONFIG_SMP atomic64_set(&cfs_rq->decay_counter, 1); - atomic64_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load, 0); #endif } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5585eb25e9a3..705991906fbe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -278,8 +278,9 @@ struct cfs_rq { * the FAIR_GROUP_SCHED case). 
*/ unsigned long runnable_load_avg, blocked_load_avg; - atomic64_t decay_counter, removed_load; + atomic64_t decay_counter; u64 last_decay; + atomic_long_t removed_load; #ifdef CONFIG_FAIR_GROUP_SCHED /* Required to track per-cpu representation of a task_group */ -- cgit v1.2.3 From a9cef46a10cc1b84bf2cdf4060766d858c0439d8 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:56 +0800 Subject: sched/tg: Remove tg.load_weight Since no one use it. Signed-off-by: Alex Shi Reviewed-by: Paul Turner Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-13-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 705991906fbe..ef0a7b2439dd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -148,7 +148,6 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; - atomic_t load_weight; #ifdef CONFIG_SMP atomic_long_t load_avg; atomic_t runnable_avg; -- cgit v1.2.3 From a9dc5d0e33c677619e4b97a38c23db1a42857905 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 20 Jun 2013 10:18:57 +0800 Subject: sched: Change get_rq_runnable_load() to static and inline Based-on-patch-by: Fengguang Wu Signed-off-by: Alex Shi Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1371694737-29336-14-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index ce5cd4892e43..16f5a30f9c88 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -502,12 +502,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, } #ifdef CONFIG_SMP -unsigned long get_rq_runnable_load(struct rq *rq) +static inline unsigned long get_rq_runnable_load(struct 
rq *rq) { return rq->cfs.runnable_load_avg; } #else -unsigned long get_rq_runnable_load(struct rq *rq) +static inline unsigned long get_rq_runnable_load(struct rq *rq) { return rq->load.weight; } -- cgit v1.2.3 From 939fd731eb88a0cdd9058d0b0143563172a217d7 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 25 Jun 2013 13:33:36 +0530 Subject: sched/debug: Add load-tracking statistics to task At present we print per-entity load-tracking statistics for cfs_rq of cgroups/runqueues. Given that per task statistics is maintained, it can be used to know the contribution made by the task to its parenting cfs_rq level. This patch adds per-task load-tracking statistics to /proc/<PID>/sched. Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130625080336.GA20175@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d803989defc0..626320985366 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -566,6 +566,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + P(se.avg.runnable_avg_sum); + P(se.avg.runnable_avg_period); + P(se.avg.load_avg_contrib); + P(se.avg.decay_count); +#endif P(policy); P(prio); #undef PN -- cgit v1.2.3 From 0fc576d592bd137437fdeb059738b789e642b744 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 27 Jun 2013 11:24:18 +0530 Subject: sched/fair: Fix typo describing flags in enqueue_entity Fix spelling of 'calling' in description of se flags in enqueue_entity().
Signed-off-by: Kamalesh Babulal Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130627055418.GA18582@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b43474a964c2..f77f9c527449 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1760,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update the normalized vruntime before updating min_vruntime - * through callig update_curr(). + * through calling update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; -- cgit v1.2.3 From 9dceefe483d7640ba0bbf3e53d1db880e7469aba Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 26 Jun 2013 16:27:35 -0600 Subject: PM / Sleep: Warn about system time after resume with pm_trace pm_trace uses the system's Real Time Clock (RTC) to save the magic number. The reason for this is that the RTC is the only reliably available piece of hardware during resume operations where a value can be set that will survive a reboot. Consequence is that after a resume (even if it is successful) your system clock will have a value corresponding to the magic number instead of the correct date/time! It is therefore advisable to use a program like ntp-date or rdate to reset the correct date/time from an external time source when using this trace option. There is no run-time message to warn users of the consequences of enabling pm_trace. Adding a warning message to pm_trace_store() will serve as a reminder to users to set the system date and time after resume. Signed-off-by: Shuah Khan Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0828070d38b4..1d1bf630e6e9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -530,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; + if (pm_trace_enabled) { + pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" + "PM: Correct system time has to be restored manually after resume.\n"); + } return n; } return -EINVAL; -- cgit v1.2.3 From e2bd416f6246d11be29999c177d2534943a5c2df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2013 19:37:23 -0700 Subject: cgroup: fix deadlock on cgroup_mutex via drop_parsed_module_refcounts() eb178d06332 ("cgroup: grab cgroup_mutex in drop_parsed_module_refcounts()") made drop_parsed_module_refcounts() grab cgroup_mutex to make lockdep assertion in for_each_subsys() happy. Unfortunately, cgroup_remount() calls the function while holding cgroup_mutex in its failure path leading to the following deadlock. 
# mount -t cgroup -o remount,memory,blkio cgroup blkio cgroup: option changes via remount are deprecated (pid=525 comm=mount) ============================================= [ INFO: possible recursive locking detected ] 3.10.0-rc4-work+ #1 Not tainted --------------------------------------------- mount/525 is trying to acquire lock: (cgroup_mutex){+.+.+.}, at: [] drop_parsed_module_refcounts+0x21/0xb0 but task is already holding lock: (cgroup_mutex){+.+.+.}, at: [] cgroup_remount+0x51/0x200 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(cgroup_mutex); lock(cgroup_mutex); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by mount/525: #0: (&type->s_umount_key#30){+.+...}, at: [] do_mount+0x2bd/0xa30 #1: (&sb->s_type->i_mutex_key#9){+.+.+.}, at: [] cgroup_remount+0x43/0x200 #2: (cgroup_mutex){+.+.+.}, at: [] cgroup_remount+0x51/0x200 #3: (cgroup_root_mutex){+.+.+.}, at: [] cgroup_remount+0x5f/0x200 stack backtrace: CPU: 2 PID: 525 Comm: mount Not tainted 3.10.0-rc4-work+ #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 ffffffff829651f0 ffff88000ec2fc28 ffffffff81c24bb1 ffff88000ec2fce8 ffffffff810f420d 0000000000000006 0000000000000001 0000000000000056 ffff8800153b4640 ffff880000000000 ffffffff81c2e468 ffff8800153b4640 Call Trace: [] dump_stack+0x19/0x1b [] __lock_acquire+0x15dd/0x1e60 [] lock_acquire+0x9c/0x1f0 [] mutex_lock_nested+0x65/0x410 [] drop_parsed_module_refcounts+0x21/0xb0 [] cgroup_remount+0x1ae/0x200 [] do_remount_sb+0x82/0x190 [] do_mount+0x5f1/0xa30 [] SyS_mount+0x83/0xc0 [] system_call_fastpath+0x16/0x1b Fix it by moving the drop_parsed_module_refcounts() invocation outside cgroup_mutex. 
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ed86773fff7..1b7b567208cd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1365,7 +1365,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; - drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } @@ -1380,7 +1379,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) { /* rebind_subsystems failed, re-populate the removed files */ cgroup_populate_dir(cgrp, false, removed_mask); - drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } @@ -1395,6 +1393,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + if (ret) + drop_parsed_module_refcounts(opts.subsys_mask); return ret; } -- cgit v1.2.3 From 0ce6cba35777cf96a54ce0d5856dc962566b8717 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2013 19:37:26 -0700 Subject: cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount options 1672d04070 ("cgroup: fix cgroupfs_root early destruction path") introduced CGRP_ROOT_SUBSYS_BOUND which is used to mark completion of subsys binding on a new root; however, this broke remounts. cgroup_remount() doesn't allow changing root options via remount and CGRP_ROOT_SUBSYS_BOUND, which is set on all fully initialized roots, makes the function reject all remounts. Fix it by putting the options part in the lower 16 bits of root->flags and masking the comparisons. While at it, make cgroup_remount() emit an error message explaining why it's rejecting a remount request, so that it's less of a mystery.
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1b7b567208cd..5a2fcf5bcc4a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1362,8 +1362,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (opts.flags != root->flags || + if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { + pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", + root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; goto out_unlock; } -- cgit v1.2.3 From add332a1523a09cf6d429933f1e2fb4ccdfe6479 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 27 Jun 2013 22:20:05 +0530 Subject: sched/debug: Fix formatting of /proc//sched This patch alters format string's width, to align all statistics at par with the longest struct sched_statistic member name under /proc//sched. 
Signed-off-by: Kamalesh Babulal Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/20130627165005.GA15583@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 626320985366..159561415d13 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -493,15 +493,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, get_nr_threads(p)); SEQ_printf(m, - "---------------------------------------------------------\n"); + "---------------------------------------------------------" + "----------\n"); #define __P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) #define __PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) PN(se.exec_start); PN(se.vruntime); @@ -560,9 +561,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) } #endif __P(nr_switches); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); @@ -585,7 +586,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } } -- cgit v1.2.3 From 
2779db8d37d4b542d9ca2575f5f178dbeaca6c86 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 28 Jun 2013 02:40:30 +0100 Subject: genirq: Fix can_request_irq() for IRQs without an action Commit 02725e7471b8 ('genirq: Use irq_get/put functions'), inadvertently changed can_request_irq() to return 0 for IRQs that have no action. This causes pcibios_lookup_irq() to select only IRQs that already have an action with IRQF_SHARED set, or to fail if there are none. Change can_request_irq() to return 1 for IRQs that have no action (if the first two conditions are met). Reported-by: Bjarni Ingi Gislason Tested-by: Bjarni Ingi Gislason (against 3.2) Signed-off-by: Ben Hutchings Cc: 709647@bugs.debian.org Cc: stable@vger.kernel.org # 2.6.39+ Link: http://bugs.debian.org/709647 Link: http://lkml.kernel.org/r/1372383630.23847.40.camel@deadeye.wl.decadent.org.uk Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e16caa81f887..514bcfd855a8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return 0; if (irq_settings_can_request(desc)) { - if (desc->action) - if (irqflags & desc->action->flags & IRQF_SHARED) - canrequest =1; + if (!desc->action || + irqflags & desc->action->flags & IRQF_SHARED) + canrequest = 1; } irq_put_desc_unlock(desc, flags); return canrequest; -- cgit v1.2.3 From d55f0cc4c9a70e3105f1e813ab5f221a65ac2ec3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 28 Jun 2013 00:23:09 -0300 Subject: genirq: generic-chip: Export some irq_gc_ functions When building imx_v6_v7_defconfig with imx-drm drivers selected as modules, we get the following build errors: ERROR: "irq_gc_mask_clr_bit" [drivers/staging/imx-drm/ipu-v3/imx-ipu-v3.ko] undefined! ERROR: "irq_gc_mask_set_bit" [drivers/staging/imx-drm/ipu-v3/imx-ipu-v3.ko] undefined! 
ERROR: "irq_gc_ack_set_bit" [drivers/staging/imx-drm/ipu-v3/imx-ipu-v3.ko] undefined! Export the required functions to avoid this problem. Signed-off-by: Fabio Estevam Cc: shawn.guo@linaro.org Cc: kernel@pengutronix.de Link: http://lkml.kernel.org/r/1372389789-7048-1-git-send-email-festevam@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index a746a8f54dae..76ea748324f5 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -62,6 +62,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); /** * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register @@ -81,6 +82,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); /** * irq_gc_unmask_enable_reg - Unmask chip via enable register @@ -115,6 +117,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); /** * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit -- cgit v1.2.3 From ccc414f83914178c7ab04ac4d4f0331fe4c37231 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Jun 2013 11:45:15 +0200 Subject: genirq: Add the generic chip to the genirq docbook Signed-off-by: Thomas Gleixner Cc: Randy Dunlap --- kernel/irq/generic-chip.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 76ea748324f5..1c39eccc1eaf 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -45,7 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) } /** - * irq_gc_mask_set_mask_bit 
- Mask chip via setting bit in mask register + * irq_gc_mask_set_bit - Mask chip via setting bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -65,7 +65,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); /** - * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -167,7 +167,8 @@ void irq_gc_eoi(struct irq_data *d) /** * irq_gc_set_wake - Set/clr wake bit for an interrupt - * @d: irq_data + * @d: irq_data + * @on: Indicates whether the wake bit should be set or cleared * * For chips where the wake from suspend functionality is not * configured in a separate register and the wakeup active state is @@ -339,7 +340,7 @@ EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); */ static struct lock_class_key irq_nested_lock_class; -/** +/* * irq_map_generic_chip - Map a generic chip for an irq domain */ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, @@ -454,7 +455,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip); /** * irq_setup_alt_chip - Switch to alternative chip * @d: irq_data for this interrupt - * @type Flow type to be initialized + * @type: Flow type to be initialized * * Only to be called from chip->irq_set_type() callbacks. */ -- cgit v1.2.3 From 333bb864f192015a53b5060b829089decd0220ef Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 28 Jun 2013 19:10:35 +0800 Subject: sched/debug: Remove CONFIG_FAIR_GROUP_SCHED mask Now that we are using runnable load avg in sched balance, we don't need to keep it under CONFIG_FAIR_GROUP_SCHED. Also align the code style to #ifdef instead of #if defined() and reorder the tg output info. 
Signed-off-by: Alex Shi Cc: pjt@google.com Cc: kamalesh@linux.vnet.ibm.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1372417835-4698-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 159561415d13..e076bddd4c66 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", - atomic_long_read(&cfs_rq->tg->load_avg)); +#ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", atomic_read(&cfs_rq->tg->runnable_avg)); #endif +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -567,7 +569,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); P(se.avg.runnable_avg_period); P(se.avg.load_avg_contrib); -- cgit v1.2.3 From d2e08473f2488d53a71c2f53455f934ec6c44c53 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 11:46:09 -0700 Subject: softirq: Use _RET_IP_ 
Use the already defined macro to pass the function return address. Signed-off-by: Davidlohr Bueso Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1367347569.1784.3.camel@buesod1.americas.hpqcorp.net Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..a5f88362589b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_DISABLE_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); @@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt) WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); + trace_softirqs_on(_RET_IP_); sub_preempt_count(cnt); } @@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) void local_bh_enable(void) { - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + _local_bh_enable_ip(_RET_IP_); } EXPORT_SYMBOL(local_bh_enable); @@ -223,8 +222,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); -- cgit v1.2.3 From 9e04d3804d3ac97d8c03a41d78d0f0674b5d01e1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 21 May 2013 20:43:50 +0200 Subject: timer: Fix jiffies wrap behavior of round_jiffies_common() Direct compare of jiffies related values does not work in the wrap around case. Replace it with time_is_after_jiffies(). 
Signed-off-by: Bart Van Assche Cc: Arjan van de Ven Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/519BC066.5080600@acm.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 15ffdb3f1948..15bc1b41021d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; - if (j <= jiffies) /* rounding ate our timeout entirely; */ - return original; - return j; + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; } /** -- cgit v1.2.3 From 7c4c3a0f18ba57ea2a2985034532303d2929902a Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:44 +0100 Subject: hrtimers: Support resuming with two or more CPUs online (but stopped) hrtimers_resume() only reprograms the timers for the current CPU as it assumes that all other CPUs are offline at this point in the resume process. If other CPUs are online then their timers will not be corrected and they may fire at the wrong time. When running as a Xen guest, this assumption is not true. Non-boot CPUs are only stopped with IRQs disabled instead of offlining them. This is a performance optimization as disabling the CPUs would add an unacceptable amount of additional downtime during a live migration (> 200 ms for a 4 VCPU guest). hrtimers_resume() cannot call on_each_cpu(retrigger_next_event,...) as the other CPUs will be stopped with IRQs disabled. Instead, defer the call to the next softirq. 
[ tglx: Separated the xen change out ] Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-2-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f8..e86827e94c9a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -773,15 +773,24 @@ void clock_was_set(void) /* * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): + * interrupt on all online CPUs. However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred to the softirq. + * + * The one-shot timer has already been programmed to fire immediately + * (see tick_resume_oneshot()) and this interrupt will trigger the + * softirq to run early enough to correctly reprogram the timers on + * all CPUs. */ void hrtimers_resume(void) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - retrigger_next_event(NULL); - timerfd_clock_was_set(); + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); } static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) -- cgit v1.2.3 From 04397fe94ad65289884b9862b6a0c722ececaadf Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:45 +0100 Subject: timekeeping: Pass flags instead of multiple bools to timekeeping_update() Instead of passing multiple bools to timekeeping_update(), define flags and use a single 'action' parameter. It is then more obvious what each timekeeping_update() call does.
Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-3-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 838fc0777b68..d8b23a929e66 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -27,6 +27,9 @@ #include "ntp_internal.h" #include "timekeeping_internal.h" +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) + static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static seqcount_t timekeeper_seq; @@ -242,16 +245,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) +static void timekeeping_update(struct timekeeper *tk, unsigned int action) { - if (clearntp) { + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } update_vsyscall(tk); update_pvclock_gtod(tk); - if (mirror) + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } @@ -509,7 +512,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -553,7 +556,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -643,7 +646,7 @@ static 
int change_clocksource(void *data) module_put(new->owner); } } - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -884,7 +887,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -966,7 +969,7 @@ static void timekeeping_resume(void) tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false, true); + timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1419,7 +1422,7 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, false, false); + timekeeping_update(real_tk, 0); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.2.3 From 780427f0e113b4c77dfff4d258c05a902cdb0eb9 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 27 Jun 2013 11:35:46 +0100 Subject: timekeeping: Indicate that clock was set in the pvclock gtod notifier If the clock was set (stepped), set the action parameter to functions in the pvclock gtod notifier chain to non-zero. This allows the callee to only do work if the clock was stepped. This will be used on Xen as the synchronization of the Xen wallclock to the control domain's (dom0) system time will be done with this notifier and updating on every timer tick is unnecessary and too expensive. 
Signed-off-by: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: John Stultz Cc: Link: http://lkml.kernel.org/r/1372329348-20841-4-git-send-email-david.vrabel@citrix.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d8b23a929e66..846d0a1f235e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -29,6 +29,7 @@ #define TK_CLEAR_NTP (1 << 0) #define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -204,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); -static void update_pvclock_gtod(struct timekeeper *tk) +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { - raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** @@ -220,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - update_pvclock_gtod(tk); + update_pvclock_gtod(tk, true); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; @@ -252,7 +253,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) ntp_clear(); } update_vsyscall(tk); - update_pvclock_gtod(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); @@ -512,7 +513,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -556,7 
+557,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -646,7 +647,7 @@ static int change_clocksource(void *data) module_put(new->owner); } } - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -887,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -969,7 +970,7 @@ static void timekeeping_resume(void) tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1243,9 +1244,10 @@ out_adjust: * It also calls into the NTP code to handle leapsecond processing. 
* */ -static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + unsigned int action = 0; while (tk->xtime_nsec >= nsecps) { int leap; @@ -1268,8 +1270,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_was_set_delayed(); + action = TK_CLOCK_WAS_SET; } } + return action; } /** @@ -1354,6 +1358,7 @@ static void update_wall_time(void) struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; + unsigned int action; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1406,7 +1411,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - accumulate_nsecs_to_secs(tk); + action = accumulate_nsecs_to_secs(tk); write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ @@ -1422,7 +1427,7 @@ static void update_wall_time(void) * updating. 
*/ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, 0); + timekeeping_update(real_tk, action); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1684,6 +1689,7 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); + update_pvclock_gtod(tk, true); clock_was_set_delayed(); } write_seqcount_end(&timekeeper_seq); -- cgit v1.2.3 From c7ba8287cd11f2fc9e2feee9e1fac34b7293658f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 29 Jun 2013 14:06:10 -0700 Subject: cgroup: CGRP_ROOT_SUBSYS_BOUND should also be ignored when mounting an existing hierarchy 0ce6cba357 ("cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount options") only updated the remount path but CGRP_ROOT_SUBSYS_BOUND should also be ignored when comparing options while mounting an existing hierarchy. As option mismatch triggers a warning but doesn't fail the mount without sane_behavior, this only triggers a spurious warning message. Fix it by only comparing CGRP_ROOT_OPTION_MASK bits when comparing new and existing root options. 
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a2fcf5bcc4a..e5583d10a325 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1703,7 +1703,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_free_root(opts.new_root); - if (root->flags != opts.flags) { + if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; -- cgit v1.2.3 From 6e94a780374ed31b280f939d4757e8d7858dff16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jun 2013 10:58:31 -0400 Subject: tracing: Failed to create system directory Running the following: # cd /sys/kernel/debug/tracing # echo p:i do_sys_open > kprobe_events # echo p:j schedule >> kprobe_events # cat kprobe_events p:kprobes/i do_sys_open p:kprobes/j schedule # echo p:i do_sys_open >> kprobe_events # cat kprobe_events p:kprobes/j schedule p:kprobes/i do_sys_open # ls /sys/kernel/debug/tracing/events/kprobes/ enable filter j Notice that the 'i' is missing from the kprobes directory. The console produces: "Failed to create system directory kprobes" This is because kprobes passes in an allocated name for the system and the ftrace event subsystem saves off that name instead of creating a duplicate for it. But the kprobes may free the system name making the pointer to it invalid. This bug was introduced by 92edca073c37 "tracing: Use direct field, type and system names" which switched from using kstrdup() on the system name in favor of just keeping a pointer to it, as the internal ftrace event system names are static and exist for the life of the computer being booted. Instead of reverting back to duplicating system names again, we can use core_kernel_data() to determine if the passed in name was allocated or static.
Then use the MSB of the ref_count to be a flag to keep track if the name was allocated or not. Then we can still save from having to duplicate strings that will always exist, but still copy the ones that may be freed. Cc: stable@vger.kernel.org # 3.10 Reported-by: "zhangwei(Jovi)" Reported-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f57b01574a30..903a0bf2685e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields); static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; +#define SYSTEM_FL_FREE_NAME (1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ + return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ + return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ + return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} + /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ @@ -344,8 +361,8 @@ static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; - WARN_ON_ONCE(system->ref_count == 0); - if (--system->ref_count) + WARN_ON_ONCE(system_refcount(system) == 0); + if (system_refcount_dec(system)) return; list_del(&system->list); @@ -354,13 +371,15 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { - 
WARN_ON_ONCE(system->ref_count == 0); - system->ref_count++; + WARN_ON_ONCE(system_refcount(system) == 0); + system_refcount_inc(system); } static void __get_system_dir(struct ftrace_subsystem_dir *dir) @@ -374,7 +393,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ - WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) @@ -1274,7 +1293,15 @@ create_new_subsystem(const char *name) return NULL; system->ref_count = 1; - system->name = name; + + /* Only allocate if dynamic (kprobes and modules) */ + if (!core_kernel_data((unsigned long)name)) { + system->ref_count |= SYSTEM_FL_FREE_NAME; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) + goto out_free; + } else + system->name = name; system->filter = NULL; @@ -1287,6 +1314,8 @@ create_new_subsystem(const char *name) return system; out_free: + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); return NULL; } -- cgit v1.2.3 From 288e984e622336bab8bc3dfdf2f190816362d9a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:06 +0200 Subject: tracing/kprobes: Avoid perf_trace_buf_*() if ->perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit() make no sense if this task/CPU has no active counters. Change kprobe_perf_func() and kretprobe_perf_func() to check call->perf_events beforehand and return if this list is empty. For example, "perf record -e some_probe -p1". Only /sbin/init will report, all other threads which hit the same probe will do perf_trace_buf_prepare/perf_trace_buf_submit just to realize that nobody wants perf_swevent_event(). 
Link: http://lkml.kernel.org/r/20130620173806.GA13151@redhat.com Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f2374172ba7b..c35bebe53ffe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1156,6 +1156,10 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) int size, __size, dsize; int rctx; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + dsize = __get_data_size(tp, regs); __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); @@ -1171,8 +1175,6 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); } @@ -1188,6 +1190,10 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, int size, __size, dsize; int rctx; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + dsize = __get_data_size(tp, regs); __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); @@ -1203,8 +1209,6 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head, NULL); } -- cgit v1.2.3 From 3fe3d6193e7cd7b4dd2bde10772f048bdefea4ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:09 +0200 Subject: 
tracing/kprobes: Kill probe_enable_lock enable_trace_probe() and disable_trace_probe() should not worry about serialization, the caller (perf_trace_init or __ftrace_set_clr_event) holds event_mutex. They are also called by kprobe_trace_self_tests_init(), but this __init function can't race with itself or trace_events.c And note that this code depended on event_mutex even before 41a7dd420c which introduced probe_enable_lock. In fact it assumes that the caller kprobe_register() can never race with itself. Otherwise, say, tp->flags manipulations are racy. Link: http://lkml.kernel.org/r/20130620173809.GA13158@redhat.com Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c35bebe53ffe..282f86cfd304 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -183,16 +183,15 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } +/* + * This and enable_trace_probe/disable_trace_probe rely on event_mutex + * held by the caller, __ftrace_set_clr_event(). + */ static int trace_probe_nr_files(struct trace_probe *tp) { - struct ftrace_event_file **file; + struct ftrace_event_file **file = rcu_dereference_raw(tp->files); int ret = 0; - /* - * Since all tp->files updater is protected by probe_enable_lock, - * we don't need to lock an rcu_read_lock. - */ - file = rcu_dereference_raw(tp->files); if (file) while (*(file++)) ret++; @@ -200,8 +199,6 @@ static int trace_probe_nr_files(struct trace_probe *tp) return ret; } -static DEFINE_MUTEX(probe_enable_lock); - /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
@@ -211,8 +208,6 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - mutex_lock(&probe_enable_lock); - if (file) { struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); @@ -223,7 +218,7 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) GFP_KERNEL); if (!new) { ret = -ENOMEM; - goto out_unlock; + goto out; } memcpy(new, old, n * sizeof(struct ftrace_event_file *)); new[n] = file; @@ -246,10 +241,7 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) else ret = enable_kprobe(&tp->rp.kp); } - - out_unlock: - mutex_unlock(&probe_enable_lock); - + out: return ret; } @@ -282,8 +274,6 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - mutex_lock(&probe_enable_lock); - if (file) { struct ftrace_event_file **new, **old; int n = trace_probe_nr_files(tp); @@ -292,7 +282,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) old = rcu_dereference_raw(tp->files); if (n == 0 || trace_probe_file_index(tp, file) < 0) { ret = -EINVAL; - goto out_unlock; + goto out; } if (n == 1) { /* Remove the last file */ @@ -303,7 +293,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) GFP_KERNEL); if (!new) { ret = -ENOMEM; - goto out_unlock; + goto out; } /* This copy & check loop copies the NULL stopper too */ @@ -326,10 +316,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) else disable_kprobe(&tp->rp.kp); } - - out_unlock: - mutex_unlock(&probe_enable_lock); - + out: return ret; } @@ -1214,6 +1201,12 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, } #endif /* CONFIG_PERF_EVENTS */ +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. 
+ */ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) @@ -1379,6 +1372,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) return NULL; } +/* + * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * stage, we can do this lockless. + */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; -- cgit v1.2.3 From a439059610ecd257dba29a612729132e470d118f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 29 Jun 2013 00:08:04 -0500 Subject: tracing: Simplify code for showing of soft disabled flag Rather than enumerating each permutation, build the enable state string up from the combination of states. This also allows for the simpler addition of more states. Link: http://lkml.kernel.org/r/9aff5af6dee2f5a40ca30df41c39d5f33e998d7a.1372479499.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 903a0bf2685e..7ee08b95c384 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -638,17 +638,17 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_file *file = filp->private_data; - char *buf; + char buf[4] = "0"; - if (file->flags & FTRACE_EVENT_FL_ENABLED) { - if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) - buf = "0*\n"; - else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) - buf = "1*\n"; - else - buf = "1\n"; - } else - buf = "0\n"; + if (file->flags & FTRACE_EVENT_FL_ENABLED && + !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + strcpy(buf, "1"); + + if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED || + file->flags & FTRACE_EVENT_FL_SOFT_MODE) + strcat(buf, "*"); + + strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, 
strlen(buf)); } -- cgit v1.2.3 From 3baa5e4cf224b8a55220cc841bb475e164b84ceb Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 29 Jun 2013 00:08:07 -0500 Subject: tracing: Fix disabling of soft disable The comment on the soft disable 'disable' case of __ftrace_event_enable_disable() states that the soft disable bit should be cleared in that case, but currently only the soft mode bit is actually cleared. This essentially leaves the standard non-soft-enable enable/disable paths as the only way to clear the soft disable flag, but the soft disable bit should also be cleared when removing a trigger with '!'. Also, the SOFT_DISABLED bit should never be set if SOFT_MODE is cleared. This fixes the above discrepancies. Link: http://lkml.kernel.org/r/b9c68dd50bc07019e6c67d3f9b29be4ef1b2badb.1372479499.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7ee08b95c384..5892470bc2ee 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -291,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, } call->class->reg(call, TRACE_REG_UNREGISTER, file); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* -- cgit v1.2.3 From b04d52e368e2cf526abb2bab61f304eaea126af2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:14 +0200 Subject: tracing/kprobes: Turn trace_probe->files into list_head I think that "ftrace_event_file *trace_probe[]" complicates the code for no reason, turn it into list_head to simplify the code. 
enable_trace_probe() no longer needs synchronize_sched(). This needs the extra sizeof(list_head) memory for every attached ftrace_event_file, hopefully not a problem in this case. Link: http://lkml.kernel.org/r/20130620173814.GA13165@redhat.com Acked-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 138 ++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f86cfd304..405b5b0f903e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,12 +35,17 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file * __rcu *files; + struct list_head files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + #define SIZEOF_TRACE_PROBE(n) \ (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) @@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; INIT_LIST_HEAD(&tp->list); + INIT_LIST_HEAD(&tp->files); return tp; error: kfree(tp->call.name); @@ -183,22 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } -/* - * This and enable_trace_probe/disable_trace_probe rely on event_mutex - * held by the caller, __ftrace_set_clr_event(). - */ -static int trace_probe_nr_files(struct trace_probe *tp) -{ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - int ret = 0; - - if (file) - while (*(file++)) - ret++; - - return ret; -} - /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
@@ -209,29 +199,18 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) int ret = 0; if (file) { - struct ftrace_event_file **new, **old; - int n = trace_probe_nr_files(tp); - - old = rcu_dereference_raw(tp->files); - /* 1 is for new one and 1 is for stopper */ - new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), - GFP_KERNEL); - if (!new) { + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) { ret = -ENOMEM; goto out; } - memcpy(new, old, n * sizeof(struct ftrace_event_file *)); - new[n] = file; - /* The last one keeps a NULL */ - rcu_assign_pointer(tp->files, new); - tp->flags |= TP_FLAG_TRACE; + link->file = file; + list_add_tail_rcu(&link->list, &tp->files); - if (old) { - /* Make sure the probe is done with old files */ - synchronize_sched(); - kfree(old); - } + tp->flags |= TP_FLAG_TRACE; } else tp->flags |= TP_FLAG_PROFILE; @@ -245,24 +224,16 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) return ret; } -static int -trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +static struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) { - struct ftrace_event_file **files; - int i; + struct event_file_link *link; - /* - * Since all tp->files updater is protected by probe_enable_lock, - * we don't need to lock an rcu_read_lock. 
- */ - files = rcu_dereference_raw(tp->files); - if (files) { - for (i = 0; files[i]; i++) - if (files[i] == file) - return i; - } + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; - return -1; + return NULL; } /* @@ -275,38 +246,23 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) int ret = 0; if (file) { - struct ftrace_event_file **new, **old; - int n = trace_probe_nr_files(tp); - int i, j; + struct event_file_link *link; - old = rcu_dereference_raw(tp->files); - if (n == 0 || trace_probe_file_index(tp, file) < 0) { + link = find_event_file_link(tp, file); + if (!link) { ret = -EINVAL; goto out; } - if (n == 1) { /* Remove the last file */ - tp->flags &= ~TP_FLAG_TRACE; - new = NULL; - } else { - new = kzalloc(n * sizeof(struct ftrace_event_file *), - GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto out; - } - - /* This copy & check loop copies the NULL stopper too */ - for (i = 0, j = 0; j < n && i < n + 1; i++) - if (old[i] != file) - new[j++] = old[i]; - } + list_del_rcu(&link->list); + /* synchronize with kprobe_trace_func/kretprobe_trace_func */ + synchronize_sched(); + kfree(link); - rcu_assign_pointer(tp->files, new); + if (!list_empty(&tp->files)) + goto out; - /* Make sure the probe is done with old files */ - synchronize_sched(); - kfree(old); + tp->flags &= ~TP_FLAG_TRACE; } else tp->flags &= ~TP_FLAG_PROFILE; @@ -871,20 +827,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - /* - * Note: preempt is already disabled around the kprobe handler. - * However, we still need an smp_read_barrier_depends() corresponding - * to smp_wmb() in rcu_assign_pointer() to access the pointer. 
- */ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - - if (unlikely(!file)) - return; + struct event_file_link *link; - while (*file) { - __kprobe_trace_func(tp, regs, *file); - file++; - } + list_for_each_entry_rcu(link, &tp->files, list) + __kprobe_trace_func(tp, regs, link->file); } /* Kretprobe handler */ @@ -931,20 +877,10 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - /* - * Note: preempt is already disabled around the kprobe handler. - * However, we still need an smp_read_barrier_depends() corresponding - * to smp_wmb() in rcu_assign_pointer() to access the pointer. - */ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - - if (unlikely(!file)) - return; + struct event_file_link *link; - while (*file) { - __kretprobe_trace_func(tp, ri, regs, *file); - file++; - } + list_for_each_entry_rcu(link, &tp->files, list) + __kretprobe_trace_func(tp, ri, regs, link->file); } /* Event entry printers */ -- cgit v1.2.3 From 10246fa35d4ffdfe472185d4cbf9c2dfd9a9f023 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 1 Jul 2013 15:58:24 -0400 Subject: tracing: Use flag buffer_disabled for irqsoff tracer If the ring buffer is disabled and the irqsoff tracer records a trace it will clear out its buffer and lose the data it had previously recorded. Currently there's a callback when writing to the tracing_on file, but if tracing is disabled via the function tracer trigger, it will not inform the irqsoff tracer to stop recording. By using the "mirror" flag (buffer_disabled) in the trace_array, that keeps track of the status of the trace_array's buffer, it gives the irqsoff tracer a fast way to know if it should record a new trace or not. The flag may be a little behind the real state of the buffer, but it should not affect the trace too much. It's more important for the irqsoff tracer to be fast.
Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++------------- kernel/trace/trace_irqsoff.c | 4 +- 2 files changed, 72 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4c9296b1916..0dc50711d656 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -226,9 +226,24 @@ cycle_t ftrace_now(int cpu) return ts; } +/** + * tracing_is_enabled - Show if global_trace has been disabled + * + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. + */ int tracing_is_enabled(void) { - return tracing_is_on(); + /* + * For quick access (irqsoff uses this in fast path), just + * return the mirror variable of the state of the ring buffer. + * It's a little racy, but we don't really care. + */ + smp_rmb(); + return !global_trace.buffer_disabled; } /* @@ -341,6 +356,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_on(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. 
+ */ + tr->buffer_disabled = 0; + /* Make the flag seen by readers */ + smp_wmb(); +} + /** * tracing_on - enable tracing buffers * @@ -349,15 +381,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void tracing_on(void) { - if (global_trace.trace_buffer.buffer) - ring_buffer_record_on(global_trace.trace_buffer.buffer); - /* - * This flag is only looked at when buffers haven't been - * allocated yet. We don't really care about the race - * between setting this flag and actually turning - * on the buffer. - */ - global_trace.buffer_disabled = 0; + tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); @@ -551,6 +575,23 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ +void tracer_tracing_off(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_off(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 1; + /* Make the flag seen by readers */ + smp_wmb(); +} + /** * tracing_off - turn off tracing buffers * @@ -561,15 +602,7 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); */ void tracing_off(void) { - if (global_trace.trace_buffer.buffer) - ring_buffer_record_off(global_trace.trace_buffer.buffer); - /* - * This flag is only looked at when buffers haven't been - * allocated yet. We don't really care about the race - * between setting this flag and actually turning - * on the buffer. 
- */ - global_trace.buffer_disabled = 1; + tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); @@ -579,14 +612,25 @@ void disable_trace_on_warning(void) tracing_off(); } +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. + */ +int tracer_tracing_is_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + return ring_buffer_record_is_on(tr->trace_buffer.buffer); + return !tr->buffer_disabled; +} + /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { - if (global_trace.trace_buffer.buffer) - return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); - return !global_trace.buffer_disabled; + return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -3958,7 +4002,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. 
*/ - if (!tracing_is_enabled() && iter->pos) + if (!tracing_is_on() && iter->pos) break; } @@ -5631,15 +5675,10 @@ rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; - if (buffer) - r = ring_buffer_record_is_on(buffer); - else - r = 0; - + r = tracer_tracing_is_on(tr); r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -5661,11 +5700,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); if (val) { - ring_buffer_record_on(buffer); + tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); } else { - ring_buffer_record_off(buffer); + tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b19d065a28cb..2aefbee93a6d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + if (!tracer_enabled || !tracing_is_enabled()) return; cpu = raw_smp_processor_id(); @@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) else return; - if (!tracer_enabled) + if (!tracer_enabled || !tracing_is_enabled()) return; data = per_cpu_ptr(tr->trace_buffer.data, cpu); -- cgit v1.2.3 From cf6735a4b103b801753748531e3658cdc8cafa5e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 19:38:11 +0200 Subject: tracing/kprobes: Don't pass addr=ip to perf_trace_buf_submit() kprobe_perf_func() and kretprobe_perf_func() pass addr=ip to perf_trace_buf_submit() for no reason. This sets perf_sample_data->addr for PERF_SAMPLE_ADDR, we already have perf_sample_data->ip initialized if PERF_SAMPLE_IP. 
Link: http://lkml.kernel.org/r/20130620173811.GA13161@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 405b5b0f903e..7ed6976493c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1098,8 +1098,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, - entry->ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1132,8 +1131,7 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, - entry->ret_ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From f1ed7c741fcd0c3d7d318e7c19813d89934b9296 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 27 Jun 2013 22:18:06 -0400 Subject: ftrace: Do not run selftest if command line parameter is set If the kernel command line ftrace filter parameters are set (ftrace_filter or ftrace_notrace), force the function self test to pass, with a warning why it was forced. If the user adds a filter to the kernel command line, it is assumed that they know what they are doing, and the self test should just not run instead of failing (which disables function tracing) or clearing the filter, as that will probably annoy the user. If the user wants the selftest to run, the message will tell them why it did not. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 +++++ kernel/trace/trace.h | 1 + kernel/trace/trace_selftest.c | 18 ++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26e19105cdcc..67708f46baae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3537,8 +3537,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + static int __init set_ftrace_notrace(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } @@ -3546,6 +3550,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f1..a88939e666b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,6 +776,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct task_struct *task) { if (list_empty(&ftrace_pids)) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2901e3b88590..a7329b7902f8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -640,13 +640,20 @@ out: * Enable ftrace, sleep 1/10 second, and then read the trace * buffer to see if all is in order. 
*/ -int +__init int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* make sure msleep has been recorded */ msleep(1); @@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) * Pretty much the same than for the function tracer from which the selftest * has been borrowed. */ -int +__init int trace_selftest_startup_function_graph(struct tracer *trace, struct trace_array *tr) { int ret; unsigned long count; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs -- cgit v1.2.3 From 2d71619c59fac95a5415a326162fa046161b938c Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Mon, 1 Jul 2013 15:31:24 -0700 Subject: tracing: Make trace_marker use the correct per-instance buffer The trace_marker file was present for each new instance created, but it added the trace mark to the global trace buffer instead of to the instance's buffer. 
Link: http://lkml.kernel.org/r/1372717885-4543-2-git-send-email-azl@google.com Cc: David Sharp Cc: Vaibhav Nagarnaik Cc: Alexander Z Lam Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0dc50711d656..e04e7119633d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4391,6 +4391,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { unsigned long addr = (unsigned long)ubuf; + struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct ring_buffer *buffer; struct print_entry *entry; @@ -4450,7 +4451,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.trace_buffer.buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { -- cgit v1.2.3 From a82274151af2b075163e3c42c828529dee311487 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Mon, 1 Jul 2013 19:37:54 -0700 Subject: tracing: Protect ftrace_trace_arrays list in trace_events.c There are multiple places where the ftrace_trace_arrays list is accessed in trace_events.c without the trace_types_lock held. 
Link: http://lkml.kernel.org/r/1372732674-22726-1-git-send-email-azl@google.com Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e04e7119633d..e36da7ff59bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -266,7 +266,7 @@ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ -static DEFINE_MUTEX(trace_types_lock); +DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a88939e666b7..2c3cba59552d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -224,6 +224,8 @@ enum { extern struct list_head ftrace_trace_arrays; +extern struct mutex trace_types_lock; + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5892470bc2ee..35c6f23c71b2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1008,6 +1008,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) int ret; /* Make sure the system still exists */ + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); list_for_each_entry(tr, &ftrace_trace_arrays, list) { list_for_each_entry(dir, &tr->systems, list) { @@ -1023,6 +1024,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) } exit_loop: mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); if (!system) return -ENODEV; @@ -1617,6 +1619,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call, int trace_add_event_call(struct ftrace_event_call *call) { int ret; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); ret = __register_event(call, NULL); @@ -1624,11 +1627,13 @@ int trace_add_event_call(struct ftrace_event_call *call) __add_event_to_tracers(call, NULL); mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return ret; } /* - * Must be called under locking both of event_mutex and trace_event_sem. + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem. 
*/ static void __trace_remove_event_call(struct ftrace_event_call *call) { @@ -1640,11 +1645,13 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) /* Remove an event_call */ void trace_remove_event_call(struct ftrace_event_call *call) { + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); down_write(&trace_event_sem); __trace_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); } #define for_each_event(event, start, end) \ @@ -1788,6 +1795,7 @@ static int trace_module_notify(struct notifier_block *self, { struct module *mod = data; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); switch (val) { case MODULE_STATE_COMING: @@ -1798,6 +1806,7 @@ static int trace_module_notify(struct notifier_block *self, break; } mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return 0; } -- cgit v1.2.3 From 4f6de4d51f4a3ab06a85e91e708cc89a513ef30c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 2 Jul 2013 15:35:11 +0930 Subject: module: don't modify argument of module_kallsyms_lookup_name() If we pass a pointer to a const string in the form "module:symbol" module_kallsyms_lookup_name() will try to split the string at the colon, i.e., will try to modify r/o data. That will, in fact, fail on a kernel with enabled CONFIG_DEBUG_RODATA. Avoid modifying the passed string in module_kallsyms_lookup_name(), modify find_module_all() instead to pass it the module name length. Signed-off-by: Mathias Krause Signed-off-by: Rusty Russell --- kernel/module.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b049939177f6..a1951aba7a03 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. 
*/ -static struct module *find_module_all(const char *name, +static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; @@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name, list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; - if (strcmp(mod->name, name) == 0) + if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) return mod; } return NULL; @@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name, struct module *find_module(const char *name) { - return find_module_all(name, false); + return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -3027,7 +3027,7 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module_all(name, true); + mod = find_module_all(name, strlen(name), true); ret = !mod || mod->state == MODULE_STATE_LIVE || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); @@ -3165,7 +3165,8 @@ static int add_unformed_module(struct module *mod) again: mutex_lock(&module_mutex); - if ((old = find_module_all(mod->name, true)) != NULL) { + old = find_module_all(mod->name, strlen(mod->name), true); + if (old != NULL) { if (old->state == MODULE_STATE_COMING || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ @@ -3576,10 +3577,8 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Don't lock: we're in enough trouble already. 
*/ preempt_disable(); if ((colon = strchr(name, ':')) != NULL) { - *colon = '\0'; - if ((mod = find_module(name)) != NULL) + if ((mod = find_module_all(name, colon - name, false)) != NULL) ret = mod_find_symname(mod, colon+1); - *colon = ':'; } else { list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) -- cgit v1.2.3 From b634d130e46a093ddf716ae9cf1bfa258ede36cf Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 2 Jul 2013 15:35:11 +0930 Subject: There is no /sys/parameters There is no such path as /sys/parameters, module parameters live in /sys/module/*/parameters. Signed-off-by: Jean Delvare Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 53b958fcd639..440e65d1a544 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name, } /* - * param_sysfs_builtin - add contents in /sys/parameters for built-in modules + * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. * -- cgit v1.2.3 From 54041d8a73337411b485ff76957fb106cb5d40d0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 2 Jul 2013 15:35:12 +0930 Subject: modules: don't fail to load on unknown parameters. Although parameters are supposed to be part of the kernel API, experimental parameters are often removed. In addition, downgrading a kernel might cause previously-working modules to fail to load. On balance, it's probably better to warn, and load the module anyway. This may let through a typo, but at least the logs will show it. 
Reported-by: Andy Lutomirski Signed-off-by: Rusty Russell --- kernel/module.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a1951aba7a03..5184877ce98a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3212,6 +3212,17 @@ out: return err; } +static int unknown_module_param_cb(char *param, char *val, const char *modname) +{ + /* Check for magic 'dyndbg' arg */ + int ret = ddebug_dyndbg_module_param_cb(param, val, modname); + if (ret != 0) { + printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", + modname, param); + } + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static int load_module(struct load_info *info, const char __user *uargs, @@ -3298,7 +3309,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, &ddebug_dyndbg_module_param_cb); + -32768, 32767, unknown_module_param_cb); if (err < 0) goto bug_cleanup; -- cgit v1.2.3 From c9b5a266b103af873abb9ac03bc3d067702c8f4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jun 2013 12:17:32 +0200 Subject: tick: Make oneshot broadcast robust vs. CPU offlining In periodic mode we remove offline cpus from the broadcast propagation mask. In oneshot mode we fail to do so. This was not a problem so far, but the recent changes to the broadcast propagation introduced a constellation which can result in a NULL pointer dereference. 
What happens is: CPU0 CPU1 idle() arch_idle() tick_broadcast_oneshot_control(OFF); set cpu1 in tick_broadcast_force_mask if (cpu_offline()) arch_cpu_dead() cpu_dead_cleanup(cpu1) cpu1 tickdevice pointer = NULL broadcast interrupt dereference cpu1 tickdevice pointer -> OOPS We dereference the pointer because cpu1 is still set in tick_broadcast_force_mask and tick_do_broadcast() expects a valid cpumask and therefore lacks any further checks. Remove the cpu from the tick_broadcast_force_mask before we set the tick device pointer to NULL. Also add a sanity check to the oneshot broadcast function, so we can detect such issues w/o crashing the machine. Reported-by: Prarit Bhargava Cc: athorlton@sgi.com Cc: CAI Qian Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1306261303260.4013@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d067c01586f5..4790037163f6 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -532,6 +532,13 @@ again: cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + /* * Wakeup the cpus which have an expired event. */ @@ -773,10 +780,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device!
*/ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From 1f73a9806bdd07a5106409bbcab3884078bd34fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 Jul 2013 22:14:10 +0200 Subject: tick: Prevent uncontrolled switch to oneshot mode When the system switches from periodic to oneshot mode, the broadcast logic causes a possibility that a CPU which has not yet switched to oneshot mode puts its own clock event device into oneshot mode without updating the state and the timer handler. CPU0 CPU1 per cpu tickdev is in periodic mode and switched to broadcast Switch to oneshot mode tick_broadcast_switch_to_oneshot() cpumask_copy(tick_broadcast_oneshot_mask, tick_broadcast_mask); broadcast device mode = oneshot Timer interrupt irq_enter() tick_check_oneshot_broadcast() dev->set_mode(ONESHOT); tick_handle_periodic() if (dev->mode == ONESHOT) dev->next_event += period; FAIL. We fail, because dev->next_event contains KTIME_MAX, if the device was in periodic mode before the uncontrolled switch to oneshot happened. We must copy the broadcast bits over to the oneshot mask, because otherwise a CPU which relies on the broadcast would not be woken up anymore after the broadcast device switched to oneshot mode. So we need to verify in tick_check_oneshot_broadcast() whether the CPU has already switched to oneshot mode. If not, leave the device untouched and let the CPU switch controlled into oneshot mode. This is a long standing bug, which was never noticed, because the main user of the broadcast x86 cannot run into that scenario, AFAICT. The nonarchitected timer mess of ARM creates a gazillion of differently broken abominations which trigger the shortcomings of that broadcast code, which better had never been necessary in the first place.
Reported-and-tested-by: Stehle Vincent-B46079 Reviewed-by: Stephen Boyd Cc: John Stultz , Cc: Mark Rutland Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1307012153060.4013@ionos.tec.linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 4790037163f6..248f80dba746 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -492,7 +492,15 @@ void tick_check_oneshot_broadcast(int cpu) if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_set_mode(td->evtdev, + CLOCK_EVT_MODE_ONESHOT); + } } } -- cgit v1.2.3 From 07bd1172902e782f288e4d44b1fde7dec0f08b6f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 Jul 2013 22:14:10 +0200 Subject: tick: Sanitize broadcast control logic The recent implementation of a generic dummy timer resulted in a different registration order of per cpu local timers which made the broadcast control logic go belly up. If the dummy timer is the first clock event device which is registered for a CPU, then it is installed, the broadcast timer is initialized and the CPU is marked as broadcast target. If a real clock event device is installed after that, we can fail to take the CPU out of the broadcast mask. In the worst case we end up with two periodic timer events firing for the same CPU. One from the per cpu hardware device and one from the broadcast. 
Now the problem is that we have no way to distinguish whether the system is in a state which makes broadcasting necessary or the broadcast bit was set due to the nonfunctional dummy timer installment. To solve this we need to keep track of the system state separately and provide a more detailed decision logic whether we keep the CPU in broadcast mode or not. The old decision logic only clears the broadcast mode, if the newly installed clock event device is not affected by power states. The new logic clears the broadcast mode if one of the following is true: - The new device is not affected by power states. - The system is not in a power state affected mode - The system has switched to oneshot mode. The oneshot broadcast is controlled from the deep idle state. The CPU is not in idle at this point, so it's safe to remove it from the mask. If we clear the broadcast bit for the CPU when a new device is installed, we also shutdown the broadcast device when this was the last CPU in the broadcast mask. If the broadcast bit is kept, then we leave the new device in shutdown state and rely on the broadcast to deliver the timer interrupts via the broadcast IPIs.
Reported-and-tested-by: Stehle Vincent-B46079 Reviewed-by: Stephen Boyd Cc: John Stultz , Cc: Mark Rutland Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1307012153060.4013@ionos.tec.linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 70 +++++++++++++++++++++++++++++++++++++------- kernel/time/tick-common.c | 3 +- 2 files changed, 61 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 248f80dba746..4430fa695b48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -30,6 +30,7 @@ static struct tick_device tick_broadcast_device; static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on; static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -140,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev) */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret = 0; + int ret; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -155,20 +157,59 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); - tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + tick_broadcast_start_periodic(bc); ret = 1; } else { /* - * When the new device is not affected by the stop - * feature and the cpu is marked in the broadcast mask - * then clear the broadcast bit. + * Clear the broadcast bit for this cpu if the + * device is not power state affected. 
*/ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { - int cpu = smp_processor_id(); + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) cpumask_clear_cpu(cpu, tick_broadcast_mask); - tick_broadcast_clear_oneshot(cpu); - } else { + else tick_device_setup_broadcast_func(dev); + + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ + tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device. 
+ */ + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + /* Nothing to do */ + ret = 0; + break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -298,6 +339,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) @@ -307,8 +349,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (!tick_broadcast_force && - cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_force) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (!tick_device_is_functional(dev)) + break; + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -366,6 +412,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) @@ -821,6 +868,7 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index edd45f64162f..64522ecdfe0e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td, * When global broadcasting is active, check if the current * device is registered 
as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic - * way. + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. */ if (tick_device_uses_broadcast(newdev, cpu)) return; -- cgit v1.2.3 From ff451961a8b2a17667a7bfa39c86fb9b351445db Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 1 Jul 2013 22:50:29 -0400 Subject: tracing: Add trace_array_get/put() to handle instance refs better Commit a695cb58162 "tracing: Prevent deleting instances when they are being read" tried to fix a race between deleting a trace instance and reading contents of a trace file. But it wasn't good enough. The following could crash the kernel: # cd /sys/kernel/debug/tracing/instances # ( while :; do mkdir foo; rmdir foo; done ) & # ( while :; do cat foo/trace &> /dev/null; done ) & Luckily this can only be done by root user, but it should be fixed regardless. The problem is that a delete of the file can happen after the reader starts to open the file but before it grabs the trace_types_lock. The solution is to validate the trace array before using it. If the trace array does not exist in the list of trace arrays, then it returns -ENODEV. There's a possibility that a trace_array could be deleted and a new one created and the open would open its file instead. But that is very minor as it will just return the data of the new trace array, it may confuse the user but it will not crash the system. As this can only be done by root anyway, the race will only occur if root is deleting what it's trying to read at the same time.
Cc: stable@vger.kernel.org # 3.10 Reported-by: Alexander Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 83 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e36da7ff59bf..6be9df1aa513 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -204,6 +204,37 @@ static struct trace_array global_trace; LIST_HEAD(ftrace_trace_arrays); +int trace_array_get(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret = -ENODEV; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + tr->ref++; + ret = 0; + break; + } + } + mutex_unlock(&trace_types_lock); + + return ret; +} + +static void __trace_array_put(struct trace_array *this_tr) +{ + WARN_ON(!this_tr->ref); + this_tr->ref--; +} + +void trace_array_put(struct trace_array *this_tr) +{ + mutex_lock(&trace_types_lock); + __trace_array_put(this_tr); + mutex_unlock(&trace_types_lock); +} + int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) @@ -2831,10 +2862,9 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file, bool snapshot) +__tracing_open(struct trace_array *tr, struct trace_cpu *tc, + struct inode *inode, struct file *file, bool snapshot) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; struct trace_iterator *iter; int cpu; @@ -2913,8 +2943,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) tracing_iter_reset(iter, cpu); } - tr->ref++; - mutex_unlock(&trace_types_lock); return iter; @@ -2944,17 +2972,20 @@ static int tracing_release(struct inode *inode, struct file *file) struct trace_array *tr; int cpu; - if (!(file->f_mode & FMODE_READ)) + /* Writes do not use 
seq_file, need to grab tr from inode */ + if (!(file->f_mode & FMODE_READ)) { + struct trace_cpu *tc = inode->i_private; + + trace_array_put(tc->tr); return 0; + } iter = m->private; tr = iter->tr; + trace_array_put(tr); mutex_lock(&trace_types_lock); - WARN_ON(!tr->ref); - tr->ref--; - for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2973,20 +3004,23 @@ static int tracing_release(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter->buffer_iter); seq_release_private(inode, file); + return 0; } static int tracing_open(struct inode *inode, struct file *file) { + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; - if (tc->cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->trace_buffer); else @@ -2994,12 +3028,16 @@ static int tracing_open(struct inode *inode, struct file *file) } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, false); + iter = __tracing_open(tr, tc, inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } + + if (ret < 0) + trace_array_put(tr); + return ret; } @@ -4575,12 +4613,16 @@ struct ftrace_buffer_info { static int tracing_snapshot_open(struct inode *inode, struct file *file) { struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, true); + iter = __tracing_open(tr, tc, inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } 
else { @@ -4593,13 +4635,16 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) kfree(m); return -ENOMEM; } - iter->tr = tc->tr; + iter->tr = tr; iter->trace_buffer = &tc->tr->max_buffer; iter->cpu_file = tc->cpu; m->private = iter; file->private_data = m; } + if (ret < 0) + trace_array_put(tr); + return ret; } @@ -4680,9 +4725,12 @@ out: static int tracing_snapshot_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); if (file->f_mode & FMODE_READ) - return tracing_release(inode, file); + return ret; /* If write only, the seq_file is just a stub */ if (m) @@ -4927,8 +4975,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - WARN_ON(!iter->tr->ref); - iter->tr->ref--; + __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); -- cgit v1.2.3 From 7b85af63034818e43aee6c1d7bf1c7c6796a9073 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 1 Jul 2013 23:34:22 -0400 Subject: tracing: Get trace_array ref counts when accessing trace files When a trace file is opened that may access a trace array, it must increment its ref count to prevent it from being deleted. Cc: stable@vger.kernel.org # 3.10 Reported-by: Alexander Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6be9df1aa513..6d9bd9b43e43 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2965,6 +2965,43 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. 
+ */ +int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; + +} + +int tracing_open_generic_tc(struct inode *inode, struct file *filp) +{ + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; + +} + static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; @@ -3008,6 +3045,32 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; } +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return 0; +} + +static int tracing_release_generic_tc(struct inode *inode, struct file *file) +{ + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; + + trace_array_put(tr); + return 0; +} + +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return single_release(inode, file); +} + static int tracing_open(struct inode *inode, struct file *file) { struct trace_cpu *tc = inode->i_private; @@ -3394,9 +3457,14 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + return single_open(file, tracing_trace_options_show, inode->i_private); } @@ -3404,7 +3472,7 @@ static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, 
.llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; @@ -3892,6 +3960,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + mutex_lock(&trace_types_lock); /* create a buffer to store the information to pass to userspace */ @@ -3944,6 +4015,7 @@ out: fail: kfree(iter->trace); kfree(iter); + __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; } @@ -3951,6 +4023,8 @@ fail: static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; mutex_lock(&trace_types_lock); @@ -3964,6 +4038,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter); + trace_array_put(tr); + return 0; } @@ -4421,6 +4497,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + trace_array_put(tr); + return 0; } @@ -4597,10 +4675,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, static int tracing_clock_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + int ret; + if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_clock_show, inode->i_private); + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_clock_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } struct ftrace_buffer_info { @@ -4796,34 +4884,38 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tc, .read = tracing_entries_read, .write = tracing_entries_write, 
.llseek = generic_file_llseek, + .release = tracing_release_generic_tc, }; static const struct file_operations tracing_total_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { + .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .write = tracing_mark_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_clock_write, }; @@ -4851,13 +4943,19 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) struct trace_cpu *tc = inode->i_private; struct trace_array *tr = tc->tr; struct ftrace_buffer_info *info; + int ret; if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) + if (!info) { + trace_array_put(tr); return -ENOMEM; + } mutex_lock(&trace_types_lock); @@ -4875,7 +4973,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_unlock(&trace_types_lock); - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret < 0) + trace_array_put(tr); + + return ret; } static unsigned int @@ -5765,9 +5867,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, } static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = rb_simple_read, .write = rb_simple_write, + .release = tracing_release_generic_tr, .llseek = 
default_llseek, }; -- cgit v1.2.3 From 8e2e2fa47129532a30cff6c25a47078dc97d9260 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 15:30:53 -0400 Subject: tracing: Add trace_array_get/put() to event handling Commit a695cb58162 "tracing: Prevent deleting instances when they are being read" tried to fix a race between deleting a trace instance and reading contents of a trace file. But it wasn't good enough. The following could crash the kernel: # cd /sys/kernel/debug/tracing/instances # ( while :; do mkdir foo; rmdir foo; done ) & # ( while :; do echo 1 > foo/events/sched/sched_switch 2> /dev/null; done ) & Luckily this can only be done by root user, but it should be fixed regardless. The problem is that a delete of the file can happen after the write to the event is opened, but before the enabling happens. The solution is to make sure the trace_array is available before succeeding in opening for write, and increment the ref counter while opened. Now the instance can be deleted when the events are writing to the buffer, but the deletion of the instance will disable all events before the instance is actually deleted. Cc: stable@vger.kernel.org # 3.10 Reported-by: Alexander Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 55 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2c3cba59552d..c7fbf93f1b7c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -226,6 +226,9 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway.
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 35c6f23c71b2..920e08fb53b3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -409,6 +409,35 @@ static void put_system(struct ftrace_subsystem_dir *dir) mutex_unlock(&event_mutex); } +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. + */ +static int tracing_open_generic_file(struct inode *inode, struct file *filp) +{ + struct ftrace_event_file *file = inode->i_private; + struct trace_array *tr = file->tr; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static int tracing_release_generic_file(struct inode *inode, struct file *filp) +{ + struct ftrace_event_file *file = inode->i_private; + struct trace_array *tr = file->tr; + + trace_array_put(tr); + + return 0; +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ @@ -1032,9 +1061,17 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Some versions of gcc think dir can be uninitialized here */ WARN_ON(!dir); + /* Still need to increment the ref count of the system */ + if (trace_array_get(tr) < 0) { + put_system(dir); + return -ENODEV; + } + ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0) { + trace_array_put(tr); put_system(dir); + } return ret; } @@ -1045,16 +1082,23 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); - if (!dir) + if (!dir) { + trace_array_put(tr); return -ENOMEM; + } dir->tr = tr; ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0) { + trace_array_put(tr); kfree(dir); + } filp->private_data = dir; @@ -1065,6 +1109,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct ftrace_subsystem_dir *dir = file->private_data; + trace_array_put(dir->tr); + /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable @@ -1192,9 +1238,10 @@ static const struct file_operations ftrace_set_event_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_file, .read = event_enable_read, .write = event_enable_write, + .release = tracing_release_generic_file, .llseek = default_llseek, }; -- cgit v1.2.3 From 2a6c24afab70dbcfee49f4c76e1511eec1a3298b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 14:48:23 -0400 Subject: tracing: Fix race between deleting buffer and setting events While analyzing the code, I discovered that there's a potential race between deleting a trace instance and setting events. 
There are a few races that can occur if events are being traced as the buffer is being deleted. Mostly the problem comes with freeing the descriptor used by the trace event callback. To prevent problems like this, the events are disabled before the buffer is deleted. The problem with the current solution is that the event_mutex is let go between disabling the events and freeing the files, which means that the events could be enabled again while the freeing takes place. Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 920e08fb53b3..7d854290bf81 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -441,14 +441,14 @@ static int tracing_release_generic_file(struct inode *inode, struct file *filp) /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ -static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; - mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; @@ -474,6 +474,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, ret = 0; } + + return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + int ret; + + mutex_lock(&event_mutex); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; @@ -2408,11 +2419,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) int event_trace_del_tracer(struct trace_array *tr) { - /* Disable any running events */ - __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); - mutex_lock(&event_mutex); + /* Disable any running events */ + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); -- cgit v1.2.3 From fa44063f9ef163c3a4c8d8c0465bb8a056b42035 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Thu, 13 Jun 2013 14:21:51 +0800 Subject: uprobes: Fix return value in error handling path When a wrong argument is passed into uprobe_events, it does not return an error: [root@jovi tracing]# echo 'p:myprobe /bin/bash' > uprobe_events [root@jovi tracing]# The proper response is: [root@jovi tracing]# echo 'p:myprobe /bin/bash' > uprobe_events -bash: echo: write error: Invalid argument Link: http://lkml.kernel.org/r/51B964FF.5000106@huawei.com Cc: Frederic Weisbecker Cc: Cc: stable@vger.kernel.org # 3.5+ Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- 
kernel/trace/trace_uprobe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 32494fb0ee64..d5d0cd368a56 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -283,8 +283,10 @@ static int create_trace_uprobe(int argc, char **argv) return -EINVAL; } arg = strchr(argv[1], ':'); - if (!arg) + if (!arg) { + ret = -EINVAL; goto fail_address_parse; + } *arg++ = '\0'; filename = argv[1]; -- cgit v1.2.3 From 11034ae9c20f4057a6127fc965906417978e69b2 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Wed, 10 Apr 2013 11:26:23 +0800 Subject: tracing: Fix irqs-off tag display in syscall tracing All syscall tracing irqs-off tags are wrong, the syscall enter entry doesn't disable irqs. [root@jovi tracing]#echo "syscalls:sys_enter_open" > set_event [root@jovi tracing]# cat trace # tracer: nop # # entries-in-buffer/entries-written: 13/13 #P:2 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | irqbalance-513 [000] d... 56115.496766: sys_open(filename: 804e1a6, flags: 0, mode: 1b6) irqbalance-513 [000] d... 56115.497008: sys_open(filename: 804e1bb, flags: 0, mode: 1b6) sendmail-771 [000] d... 56115.827982: sys_open(filename: b770e6d1, flags: 0, mode: 1b6) The reason is syscall tracing doesn't record irq_flags into buffer. The proper display is: [root@jovi tracing]#echo "syscalls:sys_enter_open" > set_event [root@jovi tracing]# cat trace # tracer: nop # # entries-in-buffer/entries-written: 14/14 #P:2 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | irqbalance-514 [001] .... 46.213921: sys_open(filename: 804e1a6, flags: 0, mode: 1b6) irqbalance-514 [001] .... 
46.214160: sys_open(filename: 804e1bb, flags: 0, mode: 1b6) <...>-920 [001] .... 47.307260: sys_open(filename: 4e82a0c5, flags: 80000, mode: 0) Link: http://lkml.kernel.org/r/1365564393-10972-3-git-send-email-jovi.zhangwei@huawei.com Cc: stable@vger.kernel.org # 2.6.35 Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8f2ac73c7a5f..322e16461072 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; int size; @@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + local_save_flags(irq_flags); + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->enter_event->event.type, size, 0, 0); + sys_data->enter_event->event.type, size, irq_flags, pc); if (!event) return; @@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!filter_current_check_discard(buffer, sys_data->enter_event, entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + trace_current_buffer_unlock_commit(buffer, event, + irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; syscall_nr = 
trace_get_syscall_nr(current, regs); @@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; + local_save_flags(irq_flags); + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->exit_event->event.type, sizeof(*entry), 0, 0); + sys_data->exit_event->event.type, sizeof(*entry), + irq_flags, pc); if (!event) return; @@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!filter_current_check_discard(buffer, sys_data->exit_event, entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + trace_current_buffer_unlock_commit(buffer, event, + irq_flags, pc); } static int reg_event_syscall_enter(struct ftrace_event_file *file, -- cgit v1.2.3 From 5280bcef91e706770cc1706eb97353e3513322b9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 19:59:57 -0400 Subject: tracing: Make tracer_tracing_{off,on,is_on}() static I have patches that will use tracer_tracing_on/off/is_on() in other files, but as they are not ready to be merged yet, and Fengguang Wu's sparse scripts pointed out that these functions were not declared anywhere, I'll make them static for now. When these functions are required to be used elsewhere, I'll remove the static then. 
Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6d9bd9b43e43..48aceb8a0328 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -387,7 +387,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; -void tracer_tracing_on(struct trace_array *tr) +static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_on(tr->trace_buffer.buffer); @@ -606,7 +606,7 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ -void tracer_tracing_off(struct trace_array *tr) +static void tracer_tracing_off(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_off(tr->trace_buffer.buffer); @@ -649,7 +649,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -int tracer_tracing_is_on(struct trace_array *tr) +static int tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); -- cgit v1.2.3 From 4480361c3c592fcbce3ef74e030719f0715e3a7e Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Wed, 10 Apr 2013 11:26:28 +0800 Subject: tracing: Remove TRACE_EVENT_TYPE enum definition TRACE_EVENT_TYPE enum is not used at present, remove it. 
Link: http://lkml.kernel.org/r/1365564393-10972-8-git-send-email-jovi.zhangwei@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c7fbf93f1b7c..1cbba04976b4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -907,12 +907,6 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); -/* trace event type bit fields, not numeric */ -enum { - TRACE_EVENT_TYPE_PRINTF = 1, - TRACE_EVENT_TYPE_RAW = 2, -}; - struct ftrace_event_field { struct list_head link; const char *name; -- cgit v1.2.3 From 8de1eb02778b64f8b292db531cf39a429f84315f Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Wed, 10 Apr 2013 11:26:30 +0800 Subject: tracing: Remove ftrace() function The only caller of function ftrace(...) was removed a long time ago, so remove the function body as well. 
Link: http://lkml.kernel.org/r/1365564393-10972-10-git-send-email-jovi.zhangwei@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 --------- kernel/trace/trace.h | 5 ----- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 48aceb8a0328..f6fed9e51c64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1637,15 +1637,6 @@ trace_function(struct trace_array *tr, __buffer_unlock_commit(buffer, event); } -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) -{ - if (likely(!atomic_read(&data->disabled))) - trace_function(tr, ip, parent_ip, flags, pc); -} - #ifdef CONFIG_STACKTRACE #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1cbba04976b4..a4ed382dea2f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -559,11 +559,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu); void poll_wait_pipe(struct trace_iterator *iter); -void ftrace(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long ip, - unsigned long parent_ip, - unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, -- cgit v1.2.3 From dcc302232c1f9b3ca16f6b8ee190eb0b1a8a0da3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 2 Jul 2013 20:30:52 -0400 Subject: tracing: Make tracing_open_generic_{tr,tc}() static I have patches that will use tracing_open_generic_tr/tc() in other files, but as they are not ready to be merged yet, and Fengguang Wu's sparse scripts pointed out that these functions were not declared anywhere, I'll make them static for now. When these functions are required to be used elsewhere, I'll remove the static then. 
Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6fed9e51c64..dc473b51415f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2960,7 +2960,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ -int tracing_open_generic_tr(struct inode *inode, struct file *filp) +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -2976,7 +2976,7 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) } -int tracing_open_generic_tc(struct inode *inode, struct file *filp) +static int tracing_open_generic_tc(struct inode *inode, struct file *filp) { struct trace_cpu *tc = inode->i_private; struct trace_array *tr = tc->tr; -- cgit v1.2.3 From 8d8022e8aba85192e937f1f0f7450e256d66ae5c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Jul 2013 10:06:28 +0930 Subject: module: do percpu allocation after uniqueness check. No, really! v3.8-rc1-5-g1fb9341 was supposed to stop parallel kvm loads exhausting percpu memory on large machines: Now we have a new state MODULE_STATE_UNFORMED, we can insert the module into the list (and thus guarantee its uniqueness) before we allocate the per-cpu region. In my defence, it didn't actually say the patch did this. Just that we "can". This patch actually *does* it. 
Signed-off-by: Rusty Russell Tested-by: Jim Hull Cc: stable@kernel.org # 3.8 --- kernel/module.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5184877ce98a..d1a161be7b04 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2940,7 +2940,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; - Elf_Shdr *pcpusec; int err; mod = setup_load_info(info, flags); @@ -2955,17 +2954,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) err = module_frob_arch_sections(info->hdr, info->sechdrs, info->secstrings, mod); if (err < 0) - goto out; + return ERR_PTR(err); - pcpusec = &info->sechdrs[info->index.pcpu]; - if (pcpusec->sh_size) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, - pcpusec->sh_size, pcpusec->sh_addralign); - if (err) - goto out; - pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; - } + /* We will do a special allocation for per-cpu sections later. */ + info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2976,17 +2968,22 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_percpu; + return ERR_PTR(err); /* Module has been copied to its final place now: return it. 
*/ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; +} -free_percpu: - percpu_modfree(mod); -out: - return ERR_PTR(err); +static int alloc_module_percpu(struct module *mod, struct load_info *info) +{ + Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; + if (!pcpusec->sh_size) + return 0; + + /* We have a special allocation for this section. */ + return percpu_modalloc(mod, pcpusec->sh_size, pcpusec->sh_addralign); } /* mod is no longer valid after this! */ @@ -3262,6 +3259,11 @@ static int load_module(struct load_info *info, const char __user *uargs, } #endif + /* To avoid stressing percpu allocator, do this once we're unique. */ + err = alloc_module_percpu(mod, info); + if (err) + goto unlink_mod; + /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) -- cgit v1.2.3 From 9eb76d7797b892a1dad4f2efb6f786681306dd13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Jul 2013 10:06:29 +0930 Subject: module: cleanup call chain. Fold alloc_module_percpu into percpu_modalloc(). 
Signed-off-by: Rusty Russell --- kernel/module.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d1a161be7b04..c18107942ac2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod) return mod->percpu; } -static int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info) { + Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; + unsigned long align = pcpusec->sh_addralign; + + if (!pcpusec->sh_size) + return 0; + if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - mod->percpu = __alloc_reserved_percpu(size, align); + mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); if (!mod->percpu) { printk(KERN_WARNING "%s: Could not allocate %lu bytes percpu data\n", - mod->name, size); + mod->name, (unsigned long)pcpusec->sh_size); return -ENOMEM; } - mod->percpu_size = size; + mod->percpu_size = pcpusec->sh_size; return 0; } @@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info) { - return -ENOMEM; + /* UP modules shouldn't have this section: ENOMEM isn't quite right */ + if (info->sechdrs[info->index.pcpu].sh_size != 0) + return -ENOMEM; + return 0; } static inline void percpu_modfree(struct module *mod) { @@ -2976,16 +2983,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) return mod; } -static int alloc_module_percpu(struct module *mod, struct load_info *info) -{ - Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; - if (!pcpusec->sh_size) - return 0; 
- - /* We have a special allocation for this section. */ - return percpu_modalloc(mod, pcpusec->sh_size, pcpusec->sh_addralign); -} - /* mod is no longer valid after this! */ static void module_deallocate(struct module *mod, struct load_info *info) { @@ -3260,7 +3257,7 @@ static int load_module(struct load_info *info, const char __user *uargs, #endif /* To avoid stressing percpu allocator, do this once we're unique. */ - err = alloc_module_percpu(mod, info); + err = percpu_modalloc(mod, info); if (err) goto unlink_mod; -- cgit v1.2.3 From 55ccb616a6e42052edb37e9c4f82cf8854a59429 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:42 +0000 Subject: posix_cpu_timer: consolidate expiry time type The posix cpu timer expiry time is stored in a union of two types: a 64-bit field if we rely on precise scheduler accounting, or a cputime_t if we rely on jiffies. This results in quite a lot of duplicate code and special cases to handle the two types. Just unify this into a single 64-bit field. cputime_t can always fit into it. 
Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 266 ++++++++++++++++++---------------------------- 1 file changed, 106 insertions(+), 160 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c3c4ea1225a4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched 
+= val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. */ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. */ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); 
break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -461,30 +414,30 @@ static void cleanup_timers(struct list_head *head, list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(ptime)) { + timer->expires = 0; } else { - timer->expires.cpu -= ptime; + timer->expires -= cputime_to_expires(ptime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(utime)) { + timer->expires = 0; } else { - timer->expires.cpu -= utime; + timer->expires -= cputime_to_expires(utime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; + if (timer->expires < sum_exec_runtime) { + timer->expires = 0; } else { - timer->expires.sched -= sum_exec_runtime; + timer->expires -= sum_exec_runtime; } } } @@ -516,7 +469,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *timer, unsigned long long now) { /* * That's all for this thread or process. 
@@ -524,9 +477,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) */ put_task_struct(timer->it.cpu.task); timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + timer->it.cpu.expires -= now; } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +509,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +527,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +552,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. */ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. 
*/ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +583,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +592,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +645,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +700,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +715,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. 
*/ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +739,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +749,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). */ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +773,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +795,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +805,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. 
*/ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +837,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +858,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -946,8 +891,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; + if (!--maxfire || prof_ticks(tsk) < t->expires) { + tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -961,8 +906,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; + if (!--maxfire || virt_ticks(tsk) < t->expires) { + tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -976,8 +921,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { + tsk->cputime_expires.sched_exp = t->expires; break; } t->firing = 1; @@ -1030,7 +975,8 @@ static void stop_process_timers(struct 
signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1068,7 +1014,7 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,8 +1024,8 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = 0; @@ -1087,8 +1033,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; + if (!--maxfire || ptime < tl->expires) { + prof_expires = tl->expires; break; } tl->firing = 1; @@ -1102,8 +1048,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; + if (!--maxfire || utime < tl->expires) { + virt_expires = tl->expires; break; } tl->firing = 1; @@ -1117,8 +1063,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; 
+ if (!--maxfire || sum_sched_runtime < tl->expires) { + sched_expires = tl->expires; break; } tl->firing = 1; @@ -1162,8 +1108,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1151,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1387,7 +1333,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1345,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. 
*/ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1405,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. -- cgit v1.2.3 From 1a7fa510b38e518d11365883934f1afa41625424 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:42 +0000 Subject: posix_cpu_timers: consolidate timer list cleanups Cleaning up the posix cpu timers on task exit shares some common code among timer list types, most notably the list traversal and expiry time update. Unify this in a common helper. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 48 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c3c4ea1225a4..b1450cee6d6d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -399,6 +399,21 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (timer->expires < curr) { + timer->expires = 0; + } else { + timer->expires -= curr; + } + } +} + /* * Clean out CPU timers still ticking when a thread exited. 
The task * pointer is cleared, and the expiry time is replaced with the residual @@ -409,37 +424,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(ptime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(ptime); - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(utime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(utime); - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < sum_exec_runtime) { - timer->expires = 0; - } else { - timer->expires -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* -- cgit v1.2.3 From 2473f3e7a97ce8bc0fe7596cdb361b21221418eb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix_cpu_timers: consolidate expired timers check Consolidate the common code amongst per thread and per process timers list on tick time. List traversal, expiry check and subsequent updates can be shared in a common helper. 
Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 118 +++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b1450cee6d6d..92a4fbf44f86 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -862,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. 
Here we update the @@ -870,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires) { - tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires) { - tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { - tsk->cputime_expires.sched_exp = t->expires; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. 
@@ -1002,7 +990,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; @@ -1017,49 +1004,10 @@ static void check_process_timers(struct task_struct *tsk, utime = cputime_to_expires(cputime.utime); ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires) { - prof_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires) { - virt_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires) { - sched_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. 
-- cgit v1.2.3 From 76cdcdd979ce00f5037804d73da583fb488ec1b2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix-timers: correctly get dying task time sample in posix_cpu_timer_schedule() In order to re-arm a timer after it fired, we take a sample of the current process or thread cputime. If the task is dying though, we don't arm anything but we cache the remaining timer expiration delta for further reads. Something similar is performed in posix_cpu_timer_get() but here we forget to take the process wide cputime sample before caching it. As a result we are storing random stack content, leading every further reads of that timer to return junk values. Fix this by taking the appropriate sample in the case of process wide timers. This probably doesn't matter much in practice because, at this stage, the thread is the last one in the group and we reached exit_notify(). This implies that we called exit_itimers() and there should be no more timers to handle for that task. So this is likely dead code anyway but let's fix the current logic and the warning that came along: kernel/posix-cpu-timers.c: In function 'posix_cpu_timer_schedule': kernel/posix-cpu-timers.c:1127: warning: 'now' may be used uninitialized in this function Then we can start to think further about cleaning up that code. Reported-by: Andrew Morton Reported-by: Chen Gang Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Chen Gang Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 92a4fbf44f86..4ebd8ad07c66 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1097,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. 
Take this opportunity to * drop our task ref. */ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } -- cgit v1.2.3 From a0b2062b0904ef07944c4a6e4d0f88ee44f1e9f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix_timers: fix racy timer delta caching on task exit When a task exits, we perform a caching of the remaining cputime delta before expiring of its timers. This is done from the following places: * When the task is reaped. We iterate through its list of posix cpu timers and store the remaining timer delta to the timer struct instead of the absolute value. (See posix_cpu_timers_exit() / posix_cpu_timers_exit_group() ) * When we call posix_cpu_timer_get() or posix_cpu_timer_schedule(). If the timer's task is considered dying when watched from these places, the same conversion from absolute to relative expiry time is performed. Then the given task's reference is released. (See clear_dead_task() ). The relevance of this caching is questionable but this is another and deeper debate. The big issue here is that these two sources of caching don't mix up very well together. More specifically, the caching can easily be done twice, resulting in a wrong delta as it gets spuriously subtracted a second time by the elapsed clock. This can happen in the following scenario: 1) The task exits and gets reaped: we call posix_cpu_timers_exit() and the absolute timer expiry values are converted to a relative delta. 2) timer_gettime() -> posix_cpu_timer_get() is called and relies on clear_dead_task() because tsk->exit_state == EXIT_DEAD. The delta gets subtracted again by the elapsed clock and we return a wrong result. To fix this, just remove the caching done on task reaping time. It doesn't bring much value on its own. The caching done from posix_cpu_timer_get/schedule is enough. 
And it would also be hard to get it really right: we could make it put and clear the target task in the timer struct so that readers know if they are dealing with a relative cached or absolute value. But it would be racy. The only safe way to do it would be to lock the itimer->it_lock so that we know nobody reads the cputime expiry value while we modify it and its target task reference. Doing so would involve some funny workarounds to avoid circular lock against the sighand lock. There is just no reason to maintain this. The user visible effect of this patch can be observed by running the following code: it creates a subthread that launches a posix cputimer which expires after 10 seconds. But then the subthread only busy loops for 2 seconds and exits. The parent reaps the subthread and reads the timer value. Its expected value should be the initial timer's expiration value minus the cputime elapsed in the subthread. Roughly 10 - 2 = 8 seconds: #include #include #include #include #include static timer_t id; static struct itimerspec val = { .it_value.tv_sec = 10, }, new; static void *thread(void *unused) { int err; struct timeval start, end, diff; timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &id); if (err < 0) { perror("Can't create timer\n"); return NULL; } /* Arm 10 sec timer */ err = timer_settime(id, 0, &val, NULL); if (err < 0) { perror("Can't set timer\n"); return NULL; } /* Exit after 2 seconds of execution */ gettimeofday(&start, NULL); do { gettimeofday(&end, NULL); timersub(&end, &start, &diff); } while (diff.tv_sec < 2); return NULL; } int main(int argc, char **argv) { pthread_t pthread; int err; err = pthread_create(&pthread, NULL, thread, NULL); if (err) { perror("Can't create thread\n"); return -1; } pthread_join(pthread, NULL); /* Just wait a little bit to make sure the child got reaped */ sleep(1); err = timer_gettime(id, &new); if (err) perror("Can't get timer value\n"); printf("%d %ld\n", new.it_value.tv_sec, new.it_value.tv_nsec); return 0; } 
Before the patch: $ ./posix_cpu_timers 6 2278074 After the patch: $ ./posix_cpu_timers 8 1158766 Before the patch, the elapsed time got two more seconds spuriously accounted. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4ebd8ad07c66..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -404,14 +404,8 @@ static void cleanup_timers_list(struct list_head *head, { struct cpu_timer_list *timer, *next; - list_for_each_entry_safe(timer, next, head, entry) { + list_for_each_entry_safe(timer, next, head, entry) list_del_init(&timer->entry); - if (timer->expires < curr) { - timer->expires = 0; - } else { - timer->expires -= curr; - } - } } /* @@ -459,15 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, unsigned long long now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. 
*/ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires -= now; + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) -- cgit v1.2.3 From 0ed5fd138539940a493dc69359cb2f49de70ad89 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:43 -0700 Subject: mm: use totalram_pages instead of num_physpages at runtime The global variable num_physpages is scheduled to be removed, so use totalram_pages instead of num_physpages at runtime. Signed-off-by: Jiang Liu Cc: Miklos Szeredi Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807d..8b5d1cd933f4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1651,7 +1651,7 @@ unsigned long snapshot_get_image_size(void) static int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); - info->num_physpages = num_physpages; + info->num_physpages = get_num_physpages(); info->image_pages = nr_copy_pages; info->pages = snapshot_get_image_size(); info->size = info->pages; @@ -1795,7 +1795,7 @@ static int check_header(struct swsusp_info *info) char *reason; reason = check_image_kernel(info); - if (!reason && info->num_physpages != num_physpages) + if (!reason && info->num_physpages != get_num_physpages()) reason = "memory size"; if (reason) { printk(KERN_ERR "PM: Image mismatch: %s\n", reason); -- cgit v1.2.3 From f170168b9a0b61ea1e647b082b38f605f1d3de3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jul 2013 15:04:58 -0700 Subject: drivers: avoid parsing names as kthread_run() format 
strings Calling kthread_run with a single name parameter causes it to be handled as a format string. Many callers are passing potentially dynamic string content, so use "%s" in those cases to avoid any potential accidents. Signed-off-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cf3adc6fe001..e08abb9461ac 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -3026,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void) struct task_struct *t; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, rsp->name); + t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); -- cgit v1.2.3 From 7ec75e1ca1bd35872a5c7b33da4b05395bc74364 Mon Sep 17 00:00:00 2001 From: liguang Date: Wed, 3 Jul 2013 15:05:00 -0700 Subject: kernel/sys.c: sys_reboot(): fix malformed panic message If LINUX_REBOOT_CMD_HALT for reboot failed, the message "cannot halt" will stay on the same line with the next message, so append a '\n'. 
Signed-off-by: liguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2bbd9a73b54c..c1da757a97b0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -511,7 +511,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, case LINUX_REBOOT_CMD_HALT: kernel_halt(); do_exit(0); - panic("cannot halt"); + panic("cannot halt.\n"); case LINUX_REBOOT_CMD_POWER_OFF: kernel_power_off(); -- cgit v1.2.3 From 45c64940c8bb64a042464ecec89d95eb4cce9b07 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:05:01 -0700 Subject: kernel/sys.c:do_sysinfo(): use get_monotonic_boottime() Change do_sysinfo() to use get_monotonic_boottime() instead of do_posix_clock_monotonic_gettime() + monotonic_to_bootbased(). Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: John Stultz Cc: Tomas Janousek Cc: Tomas Smetana Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c1da757a97b0..7bf50dcc6d53 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2355,8 +2355,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); - ktime_get_ts(&tp); - monotonic_to_bootbased(&tp); + get_monotonic_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); -- cgit v1.2.3 From 10fb46d5f79147620d0afda7d3d51302a1e38191 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 3 Jul 2013 15:05:39 -0700 Subject: kprobes: handle empty/invalid input to debugfs "enabled" file When writing invalid input to 'debug/kprobes/enabled' it'll silently be ignored. 
Even worse, when writing an empty string to this file, the outcome is purely random as the switch statement will make its decision based on the value of an uninitialized stack variable. Fix this by handling invalid/empty input as error returning -EINVAL. Signed-off-by: Mathias Krause Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index bddf3b201a48..6e33498d665c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2332,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file, if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; + buf[buf_size] = '\0'; switch (buf[0]) { case 'y': case 'Y': @@ -2343,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file, case '0': disarm_all_kprobes(); break; + default: + return -EINVAL; } return count; -- cgit v1.2.3 From 29000caecbe87b6b66f144f72111f0d02fbbf0c1 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 3 Jul 2013 15:08:12 -0700 Subject: ptrace: add ability to get/set signal-blocked mask crtools uses a parasite code for dumping processes. The parasite code is injected into a process with help PTRACE_SEIZE. Currently crtools blocks signals from a parasite code. If a process has pending signals, crtools wait while a process handles these signals. This method is not suitable for stopped tasks. A stopped task can have a few pending signals, when we will try to execute a parasite code, we will need to drop SIGSTOP, but all other signals must remain pending, because a state of processes must not be changed during checkpointing. This patch adds two ptrace commands to set/get signal-blocked mask. I think gdb can use this commands too. 
[akpm@linux-foundation.org: be consistent with brace layout] Signed-off-by: Andrey Vagin Reviewed-by: Oleg Nesterov Cc: Roland McGrath Cc: Michael Kerrisk Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 335a7ae697f5..ba5e6cea181a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -844,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_GETSIGMASK: + if (addr != sizeof(sigset_t)) { + ret = -EINVAL; + break; + } + + if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + ret = -EFAULT; + else + ret = 0; + + break; + + case PTRACE_SETSIGMASK: { + sigset_t new_set; + + if (addr != sizeof(sigset_t)) { + ret = -EINVAL; + break; + } + + if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) { + ret = -EFAULT; + break; + } + + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + /* + * Every thread does recalc_sigpending() after resume, so + * retarget_shared_pending() and recalc_sigpending() are not + * called here. 
+ */ + spin_lock_irq(&child->sighand->siglock); + child->blocked = new_set; + spin_unlock_irq(&child->sighand->siglock); + + ret = 0; + break; + } + case PTRACE_INTERRUPT: /* * Stop tracee without any side-effect on signal or job @@ -948,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request, #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: - case PTRACE_SETREGSET: - { + case PTRACE_SETREGSET: { struct iovec kiov; struct iovec __user *uiov = datavp; -- cgit v1.2.3 From 7f57cfa4e2aa29fabe69e41529fd26578adc9b58 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:15 -0700 Subject: usermodehelper: kill the sub_info->path[0] check call_usermodehelper_exec() does nothing but returns success if path[0] == 0. The only user which needs this strange feature is request_module(), it can check modprobe_path[0] itself like other users do if they want to detect the "disabled by admin" case. Kill it. Not only it looks strange, it can confuse other callers. And this allows us to revert 264b83c0 ("usermodehelper: check subprocess_info->path != NULL"), do_execve(NULL) is safe. Signed-off-by: Oleg Nesterov Acked-by: Rusty Russell Cc: Lucas De Marchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8241906c4b61..fb326365b694 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...) 
*/ WARN_ON_ONCE(wait && current_is_async()); + if (!modprobe_path[0]) + return 0; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) int retval = 0; helper_lock(); - if (!sub_info->path) { - retval = -EINVAL; - goto out; - } - - if (sub_info->path[0] == '\0') - goto out; - if (!khelper_wq || usermodehelper_disabled) { retval = -EBUSY; goto out; -- cgit v1.2.3 From 81dabb464139324c005159f5afba377104d8828d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:26 -0700 Subject: exit.c: unexport __set_special_pids() Move __set_special_pids() from exit.c to sys.c close to its single caller and make it static. And rename it to set_special_pids(), another helper with this name has gone away. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ----------- kernel/sys.c | 13 ++++++++++++- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7bb73f9d09db..3a77cd9390a1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -void __set_special_pids(struct pid *pid) -{ - struct task_struct *curr = current->group_leader; - - if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); - - if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); -} - /* * Let kernel threads use this to say that they allow a certain signal. * Must not be used if kthread was cloned with CLONE_SIGHAND. 
diff --git a/kernel/sys.c b/kernel/sys.c index 7bf50dcc6d53..071de900c824 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1309,6 +1309,17 @@ out: return retval; } +static void set_special_pids(struct pid *pid) +{ + struct task_struct *curr = current->group_leader; + + if (task_session(curr) != pid) + change_pid(curr, PIDTYPE_SID, pid); + + if (task_pgrp(curr) != pid) + change_pid(curr, PIDTYPE_PGID, pid); +} + SYSCALL_DEFINE0(setsid) { struct task_struct *group_leader = current->group_leader; @@ -1328,7 +1339,7 @@ SYSCALL_DEFINE0(setsid) goto out; group_leader->signal->leader = 1; - __set_special_pids(sid); + set_special_pids(sid); proc_clear_tty(group_leader); -- cgit v1.2.3 From b57922b6c76c3ee401bb32fd3f298409dd6e6a53 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 3 Jul 2013 15:08:29 -0700 Subject: fork: reorder permissions when violating number of processes limits When a task is attempting to violate the RLIMIT_NPROC limit we have a check to see if the task is sufficiently priviledged. The check first looks at CAP_SYS_ADMIN, then CAP_SYS_RESOURCE, then if the task is uid=0. A result is that tasks which are allowed by the uid=0 check are first checked against the security subsystem. This results in the security subsystem auditting a denial for sys_admin and sys_resource and then the task passing the uid=0 check. This patch rearranges the code to first check uid=0, since if we pass that we shouldn't hit the security system at all. We then check sys_resource, since it is the smallest capability which will solve the problem. Lastly we check the fallback everything cap_sysadmin. We don't want to give this capability many places since it is so powerful. This will eliminate many of the false positive/needless denial messages we get when a root task tries to violate the nproc limit. (note that kthreads count against root, so on a sufficiently large machine we can actually get past the default limits before any userspace tasks are launched.) 
Signed-off-by: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 987b28a1f01b..09dbda38a54b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1199,8 +1199,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != INIT_USER) + if (p->real_cred->user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; -- cgit v1.2.3 From 80628ca06c5d42929de6bc22c0a41589a834d151 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:30 -0700 Subject: kernel/fork.c:copy_process(): unify CLONE_THREAD-or-thread_group_leader code Cleanup and preparation for the next changes. Move the "if (clone_flags & CLONE_THREAD)" code down under "if (likely(p->pid))" and turn it into the "else" branch. This makes the process/thread initialization more symmetrical and removes one check. Signed-off-by: Oleg Nesterov Cc: "Eric W.
Biederman" Cc: Michal Hocko Cc: Pavel Emelyanov Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 09dbda38a54b..417cb864e20c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1446,14 +1446,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_free_pid; } - if (clone_flags & CLONE_THREAD) { - current->signal->nr_threads++; - atomic_inc(&current->signal->live); - atomic_inc(&current->signal->sigcnt); - p->group_leader = current->group_leader; - list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - } - if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1470,6 +1462,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __this_cpu_inc(process_counts); + } else { + current->signal->nr_threads++; + atomic_inc(&current->signal->live); + atomic_inc(&current->signal->sigcnt); + p->group_leader = current->group_leader; + list_add_tail_rcu(&p->thread_group, + &p->group_leader->thread_group); } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; -- cgit v1.2.3 From 8190773985141f063e1d6dc10200527c655abfb5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:31 -0700 Subject: kernel/fork.c:copy_process(): don't add the uninitialized child to thread/task/pid lists copy_process() adds the new child to thread_group/init_task.tasks list and then does attach_pid(child, PIDTYPE_PID). This means that the lockless next_thread() or next_task() can see this thread with the wrong pid. Say, "ls /proc/pid/task" can list the same inode twice. We could move attach_pid(child, PIDTYPE_PID) up, but in this case find_task_by_vpid() can find the new thread before it was fully initialized.
And this is already true for PIDTYPE_PGID/PIDTYPE_SID, With this patch copy_process() initializes child->pids[*].pid first, then calls attach_pid() to insert the task into the pid->tasks list. attach_pid() no longer need the "struct pid*" argument, it is always called after pid_link->pid was already set. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Michal Hocko Cc: Pavel Emelyanov Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 +++++++++++++--- kernel/pid.c | 12 ++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 417cb864e20c..7d6962fb6156 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1121,6 +1121,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +static inline void +init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) +{ + task->pids[type].pid = pid; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -1449,7 +1455,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); + init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { + init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); + init_task_pid(p, PIDTYPE_SID, task_session(current)); + if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; @@ -1457,10 +1467,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); - attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); - attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_PGID); + attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; @@ -1470,7 +1480,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } - attach_pid(p, PIDTYPE_PID, pid); + attach_pid(p, PIDTYPE_PID); nr_threads++; } diff --git a/kernel/pid.c b/kernel/pid.c index 0db3e791a06d..61980cefb1f5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -373,14 +373,10 @@ EXPORT_SYMBOL_GPL(find_vpid); /* * attach_pid() must be called with the tasklist_lock write-held. 
*/ -void attach_pid(struct task_struct *task, enum pid_type type, - struct pid *pid) +void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid_link *link; - - link = &task->pids[type]; - link->pid = pid; - hlist_add_head_rcu(&link->node, &pid->tasks[type]); + struct pid_link *link = &task->pids[type]; + hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, @@ -412,7 +408,7 @@ void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); - attach_pid(task, type, pid); + attach_pid(task, type); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ -- cgit v1.2.3 From 18c830df771f2ba8b4699fea9af1492275ae627b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:32 -0700 Subject: kernel/fork.c:copy_process(): consolidate the lockless CLONE_THREAD checks copy_process() does a lot of "chaotic" initializations and checks CLONE_THREAD twice before it takes tasklist. In particular it sets "p->group_leader = p" and then changes it again under tasklist if !thread_group_leader(p). This looks a bit confusing, lets create a single "if (CLONE_THREAD)" block which initializes ->exit_signal, ->group_leader, and ->tgid. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Michal Hocko Cc: Pavel Emelyanov Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7d6962fb6156..6e6a1c11b3e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1360,11 +1360,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_io; } - p->pid = pid_nr(pid); - p->tgid = p->pid; - if (clone_flags & CLONE_THREAD) - p->tgid = current->tgid; - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? @@ -1400,12 +1395,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, clear_all_latency_tracing(p); /* ok, now we should be set up.. */ - if (clone_flags & CLONE_THREAD) + p->pid = pid_nr(pid); + if (clone_flags & CLONE_THREAD) { p->exit_signal = -1; - else if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = (clone_flags & CSIGNAL); + p->group_leader = current->group_leader; + p->tgid = current->tgid; + } else { + if (clone_flags & CLONE_PARENT) + p->exit_signal = current->group_leader->exit_signal; + else + p->exit_signal = (clone_flags & CSIGNAL); + p->group_leader = p; + p->tgid = p->pid; + } p->pdeath_signal = 0; p->exit_state = 0; @@ -1414,15 +1416,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; - /* - * Ok, make it visible to the rest of the system. - * We dont wake it up yet. - */ - p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Need tasklist lock for parent etc handling! */ + /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! 
+ */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ @@ -1476,7 +1476,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, current->signal->nr_threads++; atomic_inc(&current->signal->live); atomic_inc(&current->signal->sigcnt); - p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } -- cgit v1.2.3 From 8f75af44eed0c81f818b5b345023cebfe5209400 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Wed, 3 Jul 2013 15:09:02 -0700 Subject: kernel/pid.c: move statement Move statement to static initialization of init_pid_ns. Signed-off-by: Raphael S. Carvalho Cc: "Eric W. Biederman" Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 61980cefb1f5..66505c1dfc51 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, .last_pid = 0, + .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -590,7 +591,6 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); -- cgit v1.2.3 From 0786f7b225ba1edd801dc4bfbf6191d058b943a2 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 3 Jul 2013 15:09:16 -0700 Subject: kernel/resource.c: remove the unneeded assignment in function __find_resource This line was introduced by fcb11918 ("resources: add arch hook for preventing allocation in reserved areas"). But the struct tmp was already assigned to *new in the above line, so this seems superfluous. Just remove it.
Signed-off-by: Kevin Hao Cc: Bjorn Helgaas Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 77bf11a86c7d..3f285dce9347 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -449,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old, struct resource *this = root->child; struct resource tmp = *new, avail, alloc; - tmp.flags = new->flags; tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment -- cgit v1.2.3 From fa18f7bde3ad4568d1d343b60d963bfbd8dc3991 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 26 May 2013 17:35:41 -0400 Subject: posix-cpu-timers: don't account cpu timer after stopped thread runtime accounting When tsk->signal->cputimer->running is 1, signal->cputimer (i.e. per process timer account) and tsk->sum_sched_runtime (i.e. per thread timer account) increase at the same pace because update_curr() increases both accounting. However, there is one exception. When thread exiting, __exit_signal() turns over task's sum_shced_runtime to sig->sum_sched_runtime, but it doesn't stop signal->cputimer accounting. This inconsistency makes POSIX timer wake up too early. This patch fixes it. Original-patch-by: Olivier Langlois Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Olivier Langlois Signed-off-by: KOSAKI Motohiro Signed-off-by: Frederic Weisbecker --- kernel/sched/stats.h | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..71bac979d5ee 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -161,6 +161,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) * on CONFIG_SCHEDSTATS. 
*/ +/** + * cputimer_running - return true if cputimer is running + * + * @tsk: Pointer to target task. + */ +static inline bool cputimer_running(struct task_struct *tsk) + +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return false; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return false; + + return true; +} + /** * account_group_user_time - Maintain utime for a thread group. 
* @@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); -- cgit v1.2.3 From e5302920da9ef23f9d19d4e9ac85704cc25bee7a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 5 Jul 2013 00:30:11 +0200 Subject: perf: Fix interrupt handler timing harness This patch fixes a serious bug in: 14c63f17b1fd perf: Drop sample rate when sampling is too slow There was an misunderstanding on the API of the do_div() macro. It returns the remainder of the division and this was not what the function expected leading to disabling the interrupt latency watchdog. This patch also remove a duplicate assignment in perf_sample_event_took(). 
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: dave.hansen@linux.intel.com Cc: ak@linux.intel.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/20130704223010.GA30625@quad Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1db3af933704..1833bc5a84a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -182,7 +182,7 @@ void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - tmp = do_div(tmp, 100); + do_div(tmp, 100); atomic_set(&perf_sample_allowed_ns, tmp); } @@ -232,7 +232,7 @@ DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; - u64 local_samples_len = __get_cpu_var(running_sample_length); + u64 local_samples_len; if (atomic_read(&perf_sample_allowed_ns) == 0) return; -- cgit v1.2.3 From 332962f2c88868ed3cdab466870baaa34dd58612 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Jul 2013 22:46:45 +0200 Subject: clocksource: Reselect clocksource when watchdog validated high-res capability Up to commit 5d33b883a (clocksource: Always verify highres capability) we had no sanity check when selecting a clocksource, which prevented that a non highres capable clocksource is used when the system already switched to highres/nohz mode. The new sanity check works as Alex and Tim found out. It prevents the TSC from being used. This happens because on x86 the boot process looks like this: tsc_start_freqency_validation(TSC); clocksource_register(HPET); clocksource_done_booting(); clocksource_select() Selects HPET which is valid for high-res switch_to_highres(); clocksource_register(TSC); TSC is not selected, because it is not yet flagged as VALID_HIGH_RES clocksource_watchdog() Validates TSC for highres, but that does not make TSC the current clocksource. 
Before the sanity check was added, we installed TSC unvalidated which worked most of the time. If the TSC was really detected as unstable, then the unstable logic removed it and installed HPET again. The sanity check is correct and needed. So the watchdog needs to kick a reselection of the clocksource, when it qualifies TSC as a valid high res clocksource. To solve this, we mark the clocksource which got the flag CLOCK_SOURCE_VALID_FOR_HRES set by the watchdog with an new flag CLOCK_SOURCE_RESELECT and trigger the watchdog thread. The watchdog thread evaluates the flag and invokes clocksource_select() when set. To avoid that the clocksource_done_booting() code, which is about to install the first real clocksource anyway, needs to go through clocksource_select and tick_oneshot_notify() pointlessly, split out the clocksource_watchdog_kthread() list walk code and invoke the select/notify only when called from clocksource_watchdog_kthread(). So clocksource_done_booting() can utilize the same splitout code without the select/notify invocation and the clocksource_mutex unlock/relock dance. 
Reported-and-tested-by: Alex Shi Cc: Hans Peter Anvin Cc: Tim Chen Cc: Andi Kleen Tested-by: Peter Zijlstra Cc: Ingo Molnar Cc: Davidlohr Bueso Cc: John Stultz Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1307042239150.11637@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 57 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e713ef7d19a7..50a8736757f3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -181,6 +181,7 @@ static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; @@ -301,13 +302,30 @@ static void clocksource_watchdog(unsigned long data) if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + /* - * We just marked the clocksource as highres-capable, - * notify the rest of the system as well so that we - * transition into high-res mode: + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. 
*/ - tick_clock_notify(); + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } } } @@ -404,19 +422,25 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static int clocksource_watchdog_kthread(void *data) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; LIST_HEAD(unstable); + int select = 0; - mutex_lock(&clocksource_mutex); spin_lock_irqsave(&watchdog_lock, flags); - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); list_add(&cs->wd_list, &unstable); + select = 1; + } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; } + } /* Check if the watchdog timer needs to be stopped. */ clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -426,6 +450,14 @@ static int clocksource_watchdog_kthread(void *data) list_del_init(&cs->wd_list); __clocksource_change_rating(cs, 0); } + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } @@ -445,7 +477,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -647,16 +679,11 @@ static int __init clocksource_done_booting(void) { 
mutex_lock(&clocksource_mutex); curr_clocksource = clocksource_default_clock(); - mutex_unlock(&clocksource_mutex); - finished_booting = 1; - /* * Run the watchdog first to eliminate unstable clock sources */ - clocksource_watchdog_kthread(NULL); - - mutex_lock(&clocksource_mutex); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; @@ -789,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - clocksource_select(); } /** @@ -801,6 +827,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); + clocksource_select(); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); -- cgit v1.2.3 From 002fca5df168922103a2bb52748f9984e6de80b2 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Fri, 5 Jul 2013 17:13:12 +0800 Subject: genirq: generic chip: Use DIV_ROUND_UP to calculate numchips The number of interrupts in a domain may be not divisible by the number of interrupts each chip handles. Integer division may truncate the result, thus use DIV_ROUND_UP to count numchips. Seems all users of irq_alloc_domain_generic_chips() in current code do not have this issue. I just found the issue while reading the code. 
Signed-off-by: Axel Lin Cc: Grant Likely Cc: Tony Lindgren Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1373015592.18252.2.camel@phoenix Signed-off-by: Thomas Gleixner --- kernel/irq/generic-chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 1c39eccc1eaf..2f274f30b7e2 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -278,7 +278,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) return -EINVAL; - numchips = d->revmap_data.linear.size / irqs_per_chip; + numchips = DIV_ROUND_UP(d->revmap_data.linear.size, irqs_per_chip); if (!numchips) return -EINVAL; -- cgit v1.2.3 From 5ec2481b7b47a4005bb446d176e5d0257400c77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Jul 2013 12:09:18 +0200 Subject: hrtimers: Move SMP function call to thread context smp_call_function_* must not be called from softirq context. But clock_was_set() which calls on_each_cpu() is called from softirq context to implement a delayed clock_was_set() for the timer interrupt handler. Though that almost never gets invoked. A recent change in the resume code uses the softirq based delayed clock_was_set to support Xens resume mechanism. linux-next contains a new warning which warns if smp_call_function_* is called from softirq context which gets triggered by that Xen change. Fix this by moving the delayed clock_was_set() call to a work context. Reported-and-tested-by: Artem Savkov Reported-by: Sasha Levin Cc: David Vrabel Cc: Ingo Molnar Cc: H. 
Peter Anvin , Cc: Konrad Wilk Cc: John Stultz Cc: xen-devel@lists.xen.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e86827e94c9a..b9b9420a1297 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -721,17 +721,20 @@ static int hrtimer_switch_to_hres(void) return 1; } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + /* - * Called from timekeeping code to reprogramm the hrtimer interrupt - * device. If called from the timer interrupt context we defer it to - * softirq context. + * Called from timekeeping and resume code to reprogramm the hrtimer + * interrupt device on all cpus. */ void clock_was_set_delayed(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - cpu_base->clock_was_set = 1; - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + schedule_work(&hrtimer_work); } #else @@ -775,12 +778,7 @@ void clock_was_set(void) * During resume we might have to reprogram the high resolution timer * interrupt on all online CPUs. However, all other CPUs will be * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred to the softirq. - * - * The one-shot timer has already been programmed to fire immediately - * (see tick_resume_oneshot()) and this interrupt will trigger the - * softirq to run early enough to correctly reprogram the timers on - * all CPUs. + * must be deferred. 
*/ void hrtimers_resume(void) { @@ -789,8 +787,10 @@ void hrtimers_resume(void) WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); - cpu_base->clock_was_set = 1; - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + /* Retrigger on the local CPU */ + retrigger_next_event(NULL); + /* And schedule a retrigger for all others */ + clock_was_set_delayed(); } static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) @@ -1441,13 +1441,6 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - if (cpu_base->clock_was_set) { - cpu_base->clock_was_set = 0; - clock_was_set(); - } - hrtimer_peek_ahead_timers(); } -- cgit v1.2.3 From 73b0cd674ccc64c921e25bd7154f26d342116539 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Jul 2013 10:34:00 +0200 Subject: hrtimer: Remove unused variable Sigh, should have noticed myself. Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b9b9420a1297..3a951d8d5770 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -782,8 +782,6 @@ void clock_was_set(void) */ void hrtimers_resume(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); -- cgit v1.2.3 From 79f6530cb59e2a0af6953742a33cc29e98ca631c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Jul 2013 15:59:36 -0700 Subject: audit: fix mq_open and mq_unlink to add the MQ root as a hidden parent audit_names record The old audit PATH records for mq_open looked like this: type=PATH msg=audit(1366282323.982:869): item=1 name=(null) inode=6777 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH 
msg=audit(1366282323.982:869): item=0 name="test_mq" inode=26732 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 ...with the audit related changes that went into 3.7, they now look like this: type=PATH msg=audit(1366282236.776:3606): item=2 name=(null) inode=66655 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=1 name=(null) inode=6926 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=0 name="test_mq" Both of these look wrong to me. As Steve Grubb pointed out: "What we need is 1 PATH record that identifies the MQ. The other PATH records probably should not be there." Fix it to record the mq root as a parent, and flag it such that it should be hidden from view when the names are logged, since the root of the mq filesystem isn't terribly interesting. With this change, we get a single PATH record that looks more like this: type=PATH msg=audit(1368021604.836:484): item=0 name="test_mq" inode=16914 dev=00:0c mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmpfs_t:s0 In order to do this, a new audit_inode_parent_hidden() function is added. If we do it this way, then we avoid having the existing callers of audit_inode needing to do any sort of flag conversion if auditing is inactive. 
Signed-off-by: Jeff Layton Reported-by: Jiri Jaburek Cc: Steve Grubb Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef760..123c9b7c3979 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? */ unsigned long ino; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a2..9845cb32b60a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? 
parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; -- cgit v1.2.3 From 6beb8a23b50d38a003e80c5f16b50c56e8ae3387 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Mon, 8 Jul 2013 15:59:37 -0700 Subject: kernel/auditfilter.c: fixing build warning kernel/auditfilter.c:426: warning: this decimal constant is unsigned only in ISO C90 Signed-off-by: Raphael S. Carvalho Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d1991..0ee9eff866d6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v1.2.3 From 2f992ee85aaa7dfd2bda43efe4493af1e108d054 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Jul 2013 15:59:38 -0700 Subject: kernel/auditfilter.c: fix leak in audit_add_rule() error path If both 'tree' and 'watch' are valid we must call audit_put_tree(), just like the preceding code within audit_add_rule(). 
Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0ee9eff866d6..3d15c66b7f0b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } -- cgit v1.2.3 From b9ce54c9f59894e787e3067d2f758c297fcd6fd0 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 8 Jul 2013 15:59:39 -0700 Subject: audit: Fix decimal constant description Use proper decimal type for comparison with u32. Compilation warning was introduced by 780a7654 ("audit: Make testing for a valid loginuid explicit.") kernel/auditfilter.c: In function 'audit_data_to_entry': kernel/auditfilter.c:426:3: warning: this decimal constant is unsigned only in ISO C90 [enabled by default] if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { Signed-off-by: Michal Simek Cc: Al Viro Cc: Eric Paris Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3d15c66b7f0b..f7aee8be7fb2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v1.2.3 From dcb6b45254e2281b6f99ea7f2d51343954aa3ba8 
Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 8 Jul 2013 16:00:42 -0700 Subject: panic: add cpu/pid to warn_slowpath_common in WARNING printk()s Add the cpu/pid that called WARN() so that the stack traces can be matched up with the WARNING messages. [akpm@linux-foundation.org: remove stray quote] Signed-off-by: Alex Thorlton Reviewed-by: Robin Holt Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..97712319f128 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -399,8 +399,9 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); -- cgit v1.2.3 From 7c8df28633bf0b7eb253f866029be0ac59ddb062 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:54 -0700 Subject: ptrace: revert "Prepare to fix racy accesses on task breakpoints" This reverts commit bf26c018490c ("Prepare to fix racy accesses on task breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Now that ptrace_get_breakpoints/ptrace_put_breakpoints have no callers, we can kill them and remove task->ptrace_bp_refcnt. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/ptrace.c | 16 ---------------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fafe75d9e6f6..a949819055d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -808,7 +808,7 @@ void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ba5e6cea181a..a146ee327f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1221,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v1.2.3 From fab840fc2d542fabcab903db8e03589a6702ba5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:05 -0700 Subject: ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child) Change ptrace_detach() to call flush_ptrace_hw_breakpoint(child). This frees the slots for non-ptrace PERF_TYPE_BREAKPOINT users, and this ensures that the tracee won't be killed by SIGTRAP triggered by the active breakpoints. 
Test-case: unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void func(void) { } int main(void) { int pid, stat; unsigned long dr7; pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGHUP); func(); return 0x13; } assert(pid == waitpid(-1, &stat, 0)); assert(WSTOPSIG(stat) == SIGHUP); assert(write_dr(pid, 0, (long)func) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); assert(ptrace(PTRACE_DETACH, pid, 0,0) == 0); assert(pid == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Before this patch the child is killed after PTRACE_DETACH. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..4041f5747e73 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. 
*/ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* -- cgit v1.2.3 From 0efbee70890c992f31a7b294ac654ff6c62d51c5 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:31 -0700 Subject: reboot: remove -stable friendly PF_THREAD_BOUND define Remove the prior patch's #define for easier backporting to the stable releases. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 071de900c824..b882440bd0c0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,11 +362,6 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -/* Add backwards compatibility for stable trees. */ -#ifndef PF_NO_SETAFFINITY -#define PF_NO_SETAFFINITY PF_THREAD_BOUND -#endif - static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ -- cgit v1.2.3 From 15d94b82565ebfb0cf27830b96e6cf5ed2d12a9a Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:32 -0700 Subject: reboot: move shutdown/reboot related functions to kernel/reboot.c This patch is preparatory. It moves reboot related syscall, etc functions from kernel/sys.c to kernel/reboot.c. Signed-off-by: Robin Holt Cc: H. 
Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/reboot.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 331 ----------------------------------------------------- 3 files changed, 347 insertions(+), 332 deletions(-) create mode 100644 kernel/reboot.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 271fd3119af9..470839d1a30e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o cred.o \ + notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 000000000000..37d2636a65c2 --- /dev/null +++ b/kernel/reboot.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/reboot.c + * + * Copyright (C) 2013 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. 
+ */ +void emergency_restart(void) +{ + kmsg_dump(KMSG_DUMP_EMERG); + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + usermodehelper_disable(); + device_shutdown(); +} + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. 
+ * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + migrate_to_reboot_cpu(); + syscore_shutdown(); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. 
+ */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(current); + char buffer[256]; + int ret = 0; + + /* We only trust the superuser with rebooting the system. */ + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(pid_ns, cmd); + if (ret) + return ret; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + do_exit(0); + panic("cannot halt"); + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = -EFAULT; + break; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + ret = kernel_kexec(); + break; +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + ret = hibernate(); + break; +#endif + + default: + ret = -EINVAL; + break; + } + mutex_unlock(&reboot_mutex); + return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ + 
kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static int __orderly_poweroff(bool force) +{ + char **argv; + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret; + + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + ret = -ENOMEM; + } + + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
+ */ +int orderly_poweroff(bool force) +{ + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sys.c b/kernel/sys.c index b882440bd0c0..771129b299f8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -115,20 +115,6 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); -/* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. @@ -308,261 +294,6 @@ out_unlock: return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. 
- */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -static void migrate_to_reboot_cpu(void) -{ - /* The boot cpu is always logical cpu 0 */ - int cpu = 0; - - cpu_hotplug_disable(); - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(cpu)) - cpu = cpumask_first(cpu_online_mask); - - /* Prevent races with other tasks migrating this task */ - current->flags |= PF_NO_SETAFFINITY; - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. 
- */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - migrate_to_reboot_cpu(); - syscore_shutdown(); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. 
- */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(current); - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* - * If pid namespaces are enabled and the current task is in a child - * pid_namespace, the command is handled by reboot_pid_ns() which will - * call do_exit(). - */ - ret = reboot_pid_ns(pid_ns, cmd); - if (ret) - return ret; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt.\n"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - 
kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -2287,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ - char **argv; - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); - if (argv) { - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - argv_free(argv); - } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - ret = -ENOMEM; - } - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ - __orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. 
- */ -int orderly_poweroff(bool force) -{ - if (force) /* do not override the pending "true" */ - poweroff_force = true; - schedule_work(&poweroff_work); - return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill -- cgit v1.2.3 From 972ee83df88a7fd84c228a31b4f9611299898984 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:34 -0700 Subject: reboot: checkpatch.pl the new kernel/reboot.c file Get the new file to pass scripts/checkpatch.pl Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 37d2636a65c2..abb6a0483716 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -4,6 +4,8 @@ * Copyright (C) 2013 Linus Torvalds */ +#define pr_fmt(fmt) "reboot: " fmt + #include <linux/export.h> #include <linux/kexec.h> #include <linux/kmod.h> @@ -114,9 +116,9 @@ void kernel_restart(char *cmd) migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); + pr_emerg("Restarting system\n"); else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + pr_emerg("Restarting system with command '%s'\n", cmd); kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } @@ -125,7 +127,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + (state == SYSTEM_HALT) ?
SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; usermodehelper_disable(); device_shutdown(); @@ -140,11 +142,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } - EXPORT_SYMBOL_GPL(kernel_halt); /** @@ -159,7 +160,7 @@ void kernel_power_off(void) pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); + pr_emerg("Power down\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } @@ -188,10 +189,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* For safety, we require "magic" arguments. */ if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) + magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; /* @@ -234,7 +235,8 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, break; case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); + if (ret < 0) { ret = -EFAULT; break; } @@ -300,14 +302,11 @@ static int __orderly_poweroff(bool force) ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); ret = -ENOMEM; } if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); + pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* * I guess this should try to kick off some daemon to sync and * poweroff asap. 
Or not even bother syncing if we're doing an -- cgit v1.2.3 From 1b3a5d02ee070c8f9943333b9b6370f486601e0f Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:42 -0700 Subject: reboot: move arch/x86 reboot= handling to generic kernel Merge together the unicore32, arm, and x86 reboot= command line parameter handling. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russell King Cc: Guan Xuetao Cc: Russ Anderson Cc: Robin Holt Acked-by: Ingo Molnar Acked-by: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index abb6a0483716..269ed9384cc4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) "reboot: " fmt +#include <linux/ctype.h> #include <linux/export.h> #include <linux/kexec.h> #include <linux/kmod.h> @@ -24,6 +25,18 @@ int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); +#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#define DEFAULT_REBOOT_MODE = REBOOT_HARD +#else +#define DEFAULT_REBOOT_MODE +#endif +enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; + +int reboot_default; +int reboot_cpu; +enum reboot_type reboot_type = BOOT_ACPI; +int reboot_force; + /* * If set, this is used for preparing the system to power off. */ @@ -87,7 +100,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ - int cpu = 0; + int cpu = reboot_cpu; cpu_hotplug_disable(); @@ -343,3 +356,64 @@ int orderly_poweroff(bool force) return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); + +static int __init reboot_setup(char *str) +{ + for (;;) { + /* + * Having anything passed on the command line via + * reboot= will cause us to disable DMI checking + * below.
+ */ + reboot_default = 0; + + switch (*str) { + case 'w': + reboot_mode = REBOOT_WARM; + break; + + case 'c': + reboot_mode = REBOOT_COLD; + break; + + case 'h': + reboot_mode = REBOOT_HARD; + break; + + case 's': + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else + reboot_mode = REBOOT_SOFT; + break; + + case 'g': + reboot_mode = REBOOT_GPIO; + break; + + case 'b': + case 'a': + case 'k': + case 't': + case 'e': + case 'p': + reboot_type = *str; + break; + + case 'f': + reboot_force = 1; + break; + } + + str = strchr(str, ','); + if (str) + str++; + else + break; + } + return 1; +} +__setup("reboot=", reboot_setup); -- cgit v1.2.3 From 98d1e64f95b177d0f14efbdf695a1b28e1428035 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 10 Jul 2013 16:05:12 -0700 Subject: mm: remove free_area_cache Since all architectures have been converted to use vm_unmapped_area(), there is no remaining use for the free_area_cache. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: "James E.J. 
Bottomley" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Paul Mackerras Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6e6a1c11b3e5..66635c80a813 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); -- cgit v1.2.3 From 734df5ab549ca44f40de0f07af1c8803856dfb18 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 9 Jul 2013 17:44:10 +0200 Subject: perf: Clone child context from parent context pmu Currently when the child context for inherited events is created, it's based on the pmu object of the first event of the parent context. This is wrong for the following scenario: - HW context having HW and SW event - HW event got removed (closed) - SW event stays in HW context as the only event and its pmu is used to clone the child context The issue starts when the cpu context object is touched based on the pmu context object (__get_cpu_context). In this case the HW context will work with SW cpu context ending up with following WARN below. Fixing this by using parent context pmu object to clone from child context. 
Addresses the following warning reported by Vince Weaver: [ 2716.472065] ------------[ cut here ]------------ [ 2716.476035] WARNING: at kernel/events/core.c:2122 task_ctx_sched_out+0x3c/0x) [ 2716.476035] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl nfs locn [ 2716.476035] CPU: 0 PID: 3164 Comm: perf_fuzzer Not tainted 3.10.0-rc4 #2 [ 2716.476035] Hardware name: AOpen DE7000/nMCP7ALPx-DE R1.06 Oct.19.2012, BI2 [ 2716.476035] 0000000000000000 ffffffff8102e215 0000000000000000 ffff88011fc18 [ 2716.476035] ffff8801175557f0 0000000000000000 ffff880119fda88c ffffffff810ad [ 2716.476035] ffff880119fda880 ffffffff810af02a 0000000000000009 ffff880117550 [ 2716.476035] Call Trace: [ 2716.476035] [] ? warn_slowpath_common+0x5b/0x70 [ 2716.476035] [] ? task_ctx_sched_out+0x3c/0x5f [ 2716.476035] [] ? perf_event_exit_task+0xbf/0x194 [ 2716.476035] [] ? do_exit+0x3e7/0x90c [ 2716.476035] [] ? __do_fault+0x359/0x394 [ 2716.476035] [] ? do_group_exit+0x66/0x98 [ 2716.476035] [] ? get_signal_to_deliver+0x479/0x4ad [ 2716.476035] [] ? __perf_event_task_sched_out+0x230/0x2d1 [ 2716.476035] [] ? do_signal+0x3c/0x432 [ 2716.476035] [] ? ctx_sched_in+0x43/0x141 [ 2716.476035] [] ? perf_event_context_sched_in+0x7a/0x90 [ 2716.476035] [] ? __perf_event_task_sched_in+0x31/0x118 [ 2716.476035] [] ? mmdrop+0xd/0x1c [ 2716.476035] [] ? finish_task_switch+0x7d/0xa6 [ 2716.476035] [] ? do_notify_resume+0x20/0x5d [ 2716.476035] [] ? 
retint_signal+0x3d/0x78 [ 2716.476035] ---[ end trace 827178d8a5966c3d ]--- Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373384651-6109-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1833bc5a84a7..1d1f030e2f1e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7465,7 +7465,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = alloc_perf_context(event->pmu, child); + child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; -- cgit v1.2.3 From 06f417968beac6e6b614e17b37d347aa6a6b1d30 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 9 Jul 2013 17:44:11 +0200 Subject: perf: Remove WARN_ON_ONCE() check in __perf_event_enable() for valid scenario The '!ctx->is_active' check has a valid scenario, so there's no need for the warning. The reason is that there's a time window between the 'ctx->is_active' check in the perf_event_enable() function and the __perf_event_enable() function having: - IRQs on - ctx->lock unlocked where the task could be killed and 'ctx' deactivated by perf_event_exit_task(), ending up with the warning below. So remove the WARN_ON_ONCE() check and add comments to explain it all. 
This addresses the following warning reported by Vince Weaver: [ 324.983534] ------------[ cut here ]------------ [ 324.984420] WARNING: at kernel/events/core.c:1953 __perf_event_enable+0x187/0x190() [ 324.984420] Modules linked in: [ 324.984420] CPU: 19 PID: 2715 Comm: nmi_bug_snb Not tainted 3.10.0+ #246 [ 324.984420] Hardware name: Supermicro X8DTN/X8DTN, BIOS 4.6.3 01/08/2010 [ 324.984420] 0000000000000009 ffff88043fce3ec8 ffffffff8160ea0b ffff88043fce3f00 [ 324.984420] ffffffff81080ff0 ffff8802314fdc00 ffff880231a8f800 ffff88043fcf7860 [ 324.984420] 0000000000000286 ffff880231a8f800 ffff88043fce3f10 ffffffff8108103a [ 324.984420] Call Trace: [ 324.984420] [] dump_stack+0x19/0x1b [ 324.984420] [] warn_slowpath_common+0x70/0xa0 [ 324.984420] [] warn_slowpath_null+0x1a/0x20 [ 324.984420] [] __perf_event_enable+0x187/0x190 [ 324.984420] [] remote_function+0x40/0x50 [ 324.984420] [] generic_smp_call_function_single_interrupt+0xbe/0x130 [ 324.984420] [] smp_call_function_single_interrupt+0x27/0x40 [ 324.984420] [] call_function_single_interrupt+0x6f/0x80 [ 324.984420] [] ? _raw_spin_unlock_irqrestore+0x41/0x70 [ 324.984420] [] perf_event_exit_task+0x14d/0x210 [ 324.984420] [] ? switch_task_namespaces+0x24/0x60 [ 324.984420] [] do_exit+0x2b6/0xa40 [ 324.984420] [] ? _raw_spin_unlock_irq+0x2c/0x30 [ 324.984420] [] do_group_exit+0x49/0xc0 [ 324.984420] [] get_signal_to_deliver+0x254/0x620 [ 324.984420] [] do_signal+0x57/0x5a0 [ 324.984420] [] ? __do_page_fault+0x2a4/0x4e0 [ 324.984420] [] ? retint_restore_args+0xe/0xe [ 324.984420] [] ? 
retint_signal+0x11/0x84 [ 324.984420] [] do_notify_resume+0x65/0x80 [ 324.984420] [] retint_signal+0x46/0x84 [ 324.984420] ---[ end trace 442ec2f04db3771a ]--- Reported-by: Vince Weaver Signed-off-by: Jiri Olsa Suggested-by: Peter Zijlstra Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373384651-6109-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d1f030e2f1e..ef5e7cc686e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1950,7 +1950,16 @@ static int __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - if (WARN_ON_ONCE(!ctx->is_active)) + /* + * There's a time window between 'ctx->is_active' check + * in perf_event_enable function and this place having: + * - IRQs on + * - ctx->lock unlocked + * + * where the task could be killed and 'ctx' deactivated + * by perf_event_exit_task. + */ + if (!ctx->is_active) return -EINVAL; raw_spin_lock(&ctx->lock); -- cgit v1.2.3 From 058ebd0eba3aff16b144eabf4510ed9510e1416e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 12 Jul 2013 11:08:33 +0200 Subject: perf: Fix perf_lock_task_context() vs RCU Jiri managed to trigger this warning: [] ====================================================== [] [ INFO: possible circular locking dependency detected ] [] 3.10.0+ #228 Tainted: G W [] ------------------------------------------------------- [] p/6613 is trying to acquire lock: [] (rcu_node_0){..-...}, at: [] rcu_read_unlock_special+0xa7/0x250 [] [] but task is already holding lock: [] (&ctx->lock){-.-...}, at: [] perf_lock_task_context+0xd9/0x2c0 [] [] which lock already depends on the new lock. 
[] [] the existing dependency chain (in reverse order) is: [] [] -> #4 (&ctx->lock){-.-...}: [] -> #3 (&rq->lock){-.-.-.}: [] -> #2 (&p->pi_lock){-.-.-.}: [] -> #1 (&rnp->nocb_gp_wq[1]){......}: [] -> #0 (rcu_node_0){..-...}: Paul was quick to explain that due to preemptible RCU we cannot call rcu_read_unlock() while holding scheduler (or nested) locks when part of the read side critical section was preemptible. Therefore solve it by making the entire RCU read side non-preemptible. Also pull out the retry from under the non-preempt to play nice with RT. Reported-by: Jiri Olsa Helped-out-by: Paul E. McKenney Cc: Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ef5e7cc686e3..eba8fb5834ae 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -947,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; - rcu_read_lock(); retry: + /* + * One of the few rules of preemptible RCU is that one cannot do + * rcu_read_unlock() while holding a scheduler (or nested) lock when + * part of the read side critical section was preemptible -- see + * rcu_read_unlock_special(). + * + * Since ctx->lock nests under rq->lock we must ensure the entire read + * side critical section is non-preemptible. 
+ */ + preempt_disable(); + rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* @@ -964,6 +974,8 @@ retry: raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); + rcu_read_unlock(); + preempt_enable(); goto retry; } @@ -973,6 +985,7 @@ retry: } } rcu_read_unlock(); + preempt_enable(); return ctx; } -- cgit v1.2.3 From 1b375dc30710180c4b88cc59caba6e3481ec5c8b Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 5 Jul 2013 09:29:32 +0200 Subject: mutex: Move ww_mutex definitions to ww_mutex.h Move the definitions for wound/wait mutexes out to a separate header, ww_mutex.h. This reduces clutter in mutex.h, and increases readability. Suggested-by: Linus Torvalds Signed-off-by: Maarten Lankhorst Acked-by: Peter Zijlstra Acked-by: Rik van Riel Acked-by: Maarten Lankhorst Cc: Dave Airlie Link: http://lkml.kernel.org/r/51D675DC.3000907@canonical.com [ Tidied up the code a bit. ] Signed-off-by: Ingo Molnar --- kernel/mutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index e581ada5faf4..ff05f4bd86eb 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -18,6 +18,7 @@ * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> +#include <linux/ww_mutex.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/export.h> -- cgit v1.2.3 From a272dcca1802a7e265a56e60b0d0a6715b0a8ac2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 11 Jul 2013 07:00:59 -0700 Subject: tick: broadcast: Check broadcast mode on CPU hotplug On ARM systems the dummy clockevent is registered with the cpu hotplug notifier chain before any other per-cpu clockevent. This has the side-effect of causing the dummy clockevent to be registered first in every hotplug sequence.
Because the dummy is first, we'll try to turn the broadcast source on but the code in tick_device_uses_broadcast() assumes the broadcast source is in periodic mode and calls tick_broadcast_start_periodic() unconditionally. On boot this isn't a problem because we typically haven't switched into oneshot mode yet (if at all). During hotplug, if the broadcast source isn't in periodic mode we'll replace the broadcast oneshot handler with the broadcast periodic handler and start emulating oneshot mode when we shouldn't. Due to the way the broadcast oneshot handler programs the next_event it's possible for it to contain KTIME_MAX and cause us to hang the system when the periodic handler tries to program the next tick. Fix this by using the appropriate function to start the broadcast source. Reported-by: Stephen Warren Tested-by: Stephen Warren Signed-off-by: Stephen Boyd Cc: Mark Rutland Cc: Marc Zyngier Cc: ARM kernel mailing list Cc: John Stultz Cc: Joseph Lo Link: http://lkml.kernel.org/r/20130711140059.GA27430@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 6d3f91631de6..218bcb565fed 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -157,7 +157,10 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); - tick_broadcast_start_periodic(bc); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); ret = 1; } else { /* -- cgit v1.2.3 From 971ee28cbd1ccd87b3164facd9359a534c1d2892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Jun 2013 11:18:53 +0200 Subject: sched: Fix HRTICK David reported that the HRTICK sched feature was 
borken; which was enough motivation for me to finally fix it ;-) We should not allow hrtimer code to do softirq wakeups while holding scheduler locks. The hrtimer code only needs this when we accidentally try to program an expired time. We don't much care about those anyway since we have the regular tick to fall back to. Reported-by: David Ahern Tested-by: David Ahern Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130628091853.GE29209@dyad.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b1f2e533b95..0d8eb4525e76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void) #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. 
*/ static void hrtick_clear(struct rq *rq) @@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP + +static int __hrtick_restart(struct rq *rq) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = hrtimer_get_softexpires(timer); + + return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); +} + /* * called from hardirq (IPI) context */ @@ -412,7 +414,7 @@ static void __hrtick_start(void *arg) struct rq *rq = arg; raw_spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); + __hrtick_restart(rq); rq->hrtick_csd_pending = 0; raw_spin_unlock(&rq->lock); } @@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_set_expires(timer, time); if (rq == this_rq()) { - hrtimer_restart(timer); + __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; -- cgit v1.2.3 From 913ffdb54366f94eec65c656cae8c6e00e1ab1b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 11 Jul 2013 16:34:48 -0700 Subject: cgroup: replace task_cgroup_path_from_hierarchy() with task_cgroup_path() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit task_cgroup_path_from_hierarchy() was added for the planned new users and none of the currently planned users wants to know about multiple hierarchies. This patch drops the multiple hierarchy part and makes it always return the path in the first non-dummy hierarchy. As unified hierarchy will always have id 1, this is guaranteed to return the path for the unified hierarchy if mounted; otherwise, it will return the path from the hierarchy which happens to occupy the lowest hierarchy id, which will usually be the first hierarchy mounted after boot. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Lennart Poettering Cc: Kay Sievers Cc: Jan Kaluža --- kernel/cgroup.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..afb8d53ca6c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1846,36 +1846,43 @@ out: EXPORT_SYMBOL_GPL(cgroup_path); /** - * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task - * @hierarchy_id: the hierarchy to look up @task's cgroup from * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and - * copy its path into @buf. This function grabs cgroup_mutex and shouldn't - * be used inside locks used by cgroup controller callbacks. + * Determine @task's cgroup on the first (the one with the lowest non-zero + * hierarchy_id) cgroup hierarchy and copy its path into @buf. This + * function grabs cgroup_mutex and shouldn't be used inside locks used by + * cgroup controller callbacks. + * + * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 
*/ -int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, - char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; - struct cgroup *cgrp = NULL; - int ret = -ENOENT; + struct cgroup *cgrp; + int hierarchy_id = 1, ret = 0; + + if (buflen < 2) + return -ENAMETOOLONG; mutex_lock(&cgroup_mutex); - root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); + root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); + if (root) { cgrp = task_cgroup_from_root(task, root); ret = cgroup_path(cgrp, buf, buflen); + } else { + /* if no hierarchy exists, everyone is in "/" */ + memcpy(buf, "/", 2); } mutex_unlock(&cgroup_mutex); - return ret; } -EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); +EXPORT_SYMBOL_GPL(task_cgroup_path); /* * Control Group taskset -- cgit v1.2.3 From 786e1448d9c5d2a469bcc9d2aecacd418ee1aca0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jul 2013 17:50:23 +0400 Subject: cgroup: we can use simple_lookup() now Signed-off-by: Al Viro --- kernel/cgroup.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..0e0b20b8c5db 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -802,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, unsigned long subsys_mask); @@ -2642,7 +2641,7 @@ static const struct inode_operations cgroup_file_inode_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = cgroup_lookup, + .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, 
.rename = cgroup_rename, @@ -2652,14 +2651,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -{ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_add(dentry, NULL); - return NULL; -} - /* * Check if a file is a control file */ -- cgit v1.2.3 From e5248a111bf4048a9f3fab1a9c94c4630a10592a Mon Sep 17 00:00:00 2001 From: Liu ShuoX Date: Thu, 11 Jul 2013 16:03:45 +0800 Subject: PM / Sleep: avoid 'autosleep' in shutdown progress Prevent automatic system suspend from happening during system shutdown by making try_to_suspend() check system_state and return immediately if it is not SYSTEM_RUNNING. This prevents the following breakage from happening (scenario from Zhang Yanmin): Kernel starts shutdown and calls all device driver's shutdown callback. When a driver's shutdown is called, the last wakelock is released and suspend-to-ram starts. However, as some driver's shut down callbacks already shut down devices and disabled runtime pm, the suspend-to-ram calls driver's suspend callback without noticing that device is already off and causes crash. [rjw: Changelog] Signed-off-by: Liu ShuoX Cc: 3.5+ Signed-off-by: Rafael J. 
Wysocki --- kernel/power/autosleep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index c6422ffeda9a..9012ecf7b814 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work) mutex_lock(&autosleep_lock); - if (!pm_save_wakeup_count(initial_count)) { + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { mutex_unlock(&autosleep_lock); goto out; } -- cgit v1.2.3 From 49fb4c6290c70c418a5c25eee996d6b55ea132d6 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Jun 2013 14:52:21 -0400 Subject: rcu: delete __cpuinit usage from all rcu files The __cpuinit type of throwaway sections might have made sense some time ago when RAM was more constrained, but now the savings do not offset the cost and complications. For example, the fix in commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time") is a good example of the nasty type of bugs that can be created with improper use of the various __init prefixes. After a discussion on LKML[1] it was decided that cpuinit should go the way of devinit and be phased out. Once all the users are gone, we can then finally remove the macros themselves from linux/init.h. This removes all the drivers/rcu uses of the __cpuinit macros from all C files. [1] https://lkml.org/lkml/2013/5/20/589 Cc: "Paul E. 
McKenney" Cc: Josh Triplett Cc: Dipankar Sarma Reviewed-by: Josh Triplett Signed-off-by: Paul Gortmaker --- kernel/rcutorture.c | 6 +++--- kernel/rcutree.c | 6 +++--- kernel/rcutree.h | 4 ++-- kernel/rcutree_plugin.h | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b1fa5510388d..f4871e52c546 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1476,7 +1476,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int __cpuinit +static int rcu_torture_onoff(void *arg) { int cpu; @@ -1558,7 +1558,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int __cpuinit +static int rcu_torture_onoff_init(void) { int ret; @@ -1601,7 +1601,7 @@ static void rcu_torture_onoff_cleanup(void) * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. */ -static int __cpuinit rcu_torture_stall(void *args) +static int rcu_torture_stall(void *args) { unsigned long stop_at; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e08abb9461ac..068de3a93606 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2910,7 +2910,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * can accept some slop in the rsp->completed access due to the fact * that this CPU cannot possibly have any RCU callbacks in flight yet. */ -static void __cpuinit +static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) { unsigned long flags; @@ -2962,7 +2962,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) mutex_unlock(&rsp->onoff_mutex); } -static void __cpuinit rcu_prepare_cpu(int cpu) +static void rcu_prepare_cpu(int cpu) { struct rcu_state *rsp; @@ -2974,7 +2974,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. 
*/ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4a39d364493c..b3832581043c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -521,10 +521,10 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 63098a59216e..769e12e3151b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1352,7 +1352,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. 
*/ -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; @@ -1507,7 +1507,7 @@ static int __init rcu_spawn_kthreads(void) } early_initcall(rcu_spawn_kthreads); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu) { struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; @@ -1549,7 +1549,7 @@ static int __init rcu_scheduler_really_started(void) } early_initcall(rcu_scheduler_really_started); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu) { } -- cgit v1.2.3 From 0db0628d90125193280eabb501c94feaf48fa9ab Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Jun 2013 14:53:51 -0400 Subject: kernel: delete __cpuinit usage from all core kernel files The __cpuinit type of throwaway sections might have made sense some time ago when RAM was more constrained, but now the savings do not offset the cost and complications. For example, the fix in commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time") is a good example of the nasty type of bugs that can be created with improper use of the various __init prefixes. After a discussion on LKML[1] it was decided that cpuinit should go the way of devinit and be phased out. Once all the users are gone, we can then finally remove the macros themselves from linux/init.h. This removes all the uses of the __cpuinit macros from C files in the core kernel directories (kernel, init, lib, mm, and include) that don't really have a specific maintainer. 
[1] https://lkml.org/lkml/2013/5/20/589 Signed-off-by: Paul Gortmaker --- kernel/cpu.c | 6 +++--- kernel/events/core.c | 4 ++-- kernel/fork.c | 2 +- kernel/hrtimer.c | 6 +++--- kernel/printk.c | 2 +- kernel/profile.c | 2 +- kernel/relay.c | 2 +- kernel/sched/core.c | 12 ++++++------ kernel/sched/fair.c | 2 +- kernel/smp.c | 2 +- kernel/smpboot.c | 2 +- kernel/softirq.c | 8 ++++---- kernel/time/tick-sched.c | 2 +- kernel/timer.c | 10 +++++----- kernel/workqueue.c | 4 ++-- 15 files changed, 33 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 198a38883e64..b2b227b82123 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen) { int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -419,7 +419,7 @@ out: return ret; } -int __cpuinit cpu_up(unsigned int cpu) +int cpu_up(unsigned int cpu) { int err = 0; @@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init); * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). 
*/ -void __cpuinit notify_cpu_starting(unsigned int cpu) +void notify_cpu_starting(unsigned int cpu) { unsigned long val = CPU_STARTING; diff --git a/kernel/events/core.c b/kernel/events/core.c index eba8fb5834ae..f3e9dce39bc9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7630,7 +7630,7 @@ static void __init perf_event_init_all_cpus(void) } } -static void __cpuinit perf_event_init_cpu(int cpu) +static void perf_event_init_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -7719,7 +7719,7 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int __cpuinit +static int perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; diff --git a/kernel/fork.c b/kernel/fork.c index 66635c80a813..403d2bb8a968 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1546,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links) } } -struct task_struct * __cpuinit fork_idle(int cpu) +struct task_struct *fork_idle(int cpu) { struct task_struct *task; task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f0f4fe29cd21..383319bae3f7 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, /* * Functions related to boot-time initialization: */ -static void __cpuinit init_hrtimers_cpu(int cpu) +static void init_hrtimers_cpu(int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; @@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu) #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, +static int hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int scpu = (long)hcpu; @@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static 
struct notifier_block __cpuinitdata hrtimers_nb = { +static struct notifier_block hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/printk.c b/kernel/printk.c index d37d45c90ae6..69b0890ed7e5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1921,7 +1921,7 @@ void resume_console(void) * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. */ -static int __cpuinit console_cpu_notify(struct notifier_block *self, +static int console_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { diff --git a/kernel/profile.c b/kernel/profile.c index 0bf400737660..6631e1ef55ab 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -331,7 +331,7 @@ out: put_cpu(); } -static int __cpuinit profile_cpu_callback(struct notifier_block *info, +static int profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..5001c9887db1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan, * * Returns the success/failure of the operation. 
(%NOTIFY_OK, %NOTIFY_BAD) */ -static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, +static int relay_hotcpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..b7c32cb7bfeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4133,7 +4133,7 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) +void init_idle_bootup_task(struct task_struct *idle) { idle->sched_class = &idle_sched_class; } @@ -4146,7 +4146,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -4630,7 +4630,7 @@ static void set_rq_offline(struct rq *rq) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ -static int __cpuinit +static int migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { int cpu = (long)hcpu; @@ -4684,12 +4684,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * happens before everything else. This has to be lower priority than * the notifier in the perf_event subsystem, though. 
*/ -static struct notifier_block __cpuinitdata migration_notifier = { +static struct notifier_block migration_notifier = { .notifier_call = migration_call, .priority = CPU_PRI_MIGRATION, }; -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { @@ -4702,7 +4702,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, } } -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f77f9c527449..bb456f44b7b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5506,7 +5506,7 @@ void nohz_balance_enter_idle(int cpu) set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } -static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, +static int sched_ilb_notifier(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..fe9f773d7114 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { +static struct notifier_block hotplug_cfd_notifier = { .notifier_call = hotplug_cfd, }; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 02fc5c933673..eb89e1807408 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -24,7 +24,7 @@ */ static DEFINE_PER_CPU(struct task_struct *, idle_threads); -struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) +struct task_struct *idle_thread_get(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index ca25e6e704a2..be3d3514c325 
100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) } EXPORT_SYMBOL(send_remote_softirq); -static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +static int remote_softirq_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { /* @@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +static struct notifier_block remote_softirq_cpu_notifier = { .notifier_call = remote_softirq_cpu_notify, }; @@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69601726a745..e80183f4a6c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -298,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, +static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { diff --git a/kernel/timer.c b/kernel/timer.c index 15bc1b41021d..4296d13db3d1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1505,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int __cpuinit init_timers_cpu(int cpu) +static int init_timers_cpu(int cpu) { int j; struct 
tvec_base *base; - static char __cpuinitdata tvec_base_done[NR_CPUS]; + static char tvec_base_done[NR_CPUS]; if (!tvec_base_done[cpu]) { static char boot_done; @@ -1577,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea } } -static void __cpuinit migrate_timers(int cpu) +static void migrate_timers(int cpu) { struct tvec_base *old_base; struct tvec_base *new_base; @@ -1610,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit timer_cpu_notify(struct notifier_block *self, +static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1635,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata timers_nb = { +static struct notifier_block timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4a0c3c..0b72e816b8d0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4644,7 +4644,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. */ -static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -4697,7 +4697,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, * Workqueues should be brought down after normal priority CPU notifiers. * This will be registered as low priority CPU notifier. 
*/ -static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { -- cgit v1.2.3 From 991821c86c2fb6cc4104ce679247864dbc070a83 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Mon, 15 Jul 2013 16:32:34 +0800 Subject: tracing: Use correct config guard CONFIG_STACK_TRACER We should use CONFIG_STACK_TRACER to guard readme text of stack tracer related file, not CONFIG_STACKTRACE. Link: http://lkml.kernel.org/r/51E3B3A2.8080609@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cd500bffd9b..25b91afc29e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3537,14 +3537,14 @@ static const char readme_msg[] = "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" "\t\t\t Read the contents for more information\n" #endif -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" #endif -#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_STACK_TRACER */ ; static ssize_t -- cgit v1.2.3 From b9b3259746d77f4fcb786e2a43c25bcc40773755 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Jul 2013 16:05:51 -0700 Subject: sysfs.h: add __ATTR_RW() macro A number of parts of the kernel created their own version of this, might as well have the sysfs core provide it instead. 
Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eba8fb5834ae..dd9878029d1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6234,8 +6234,6 @@ perf_event_mux_interval_ms_store(struct device *dev, return count; } -#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) - static struct device_attribute pmu_dev_attrs[] = { __ATTR_RO(type), __ATTR_RW(perf_event_mux_interval_ms), -- cgit v1.2.3 From e69f61862ab833e9b8d3c15b6ce07fd69f3bfecc Mon Sep 17 00:00:00 2001 From: Yacine Belkadi Date: Fri, 12 Jul 2013 20:45:47 +0200 Subject: sched: Fix some kernel-doc warnings When building the htmldocs (in verbose mode), scripts/kernel-doc reports the following type of warnings: Warning(kernel/sched/core.c:936): No description found for return value of 'task_curr' ... Fix those by: - adding the missing descriptions - using "Return" sections for the descriptions Signed-off-by: Yacine Belkadi Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1373654747-2389-1-git-send-email-yacine.belkadi.1@gmail.com [ While at it, fix the cpupri_init() explanation. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 82 ++++++++++++++++++++++++++++++++++++++------------- kernel/sched/cpupri.c | 4 +-- kernel/sched/fair.c | 9 ++++-- 3 files changed, 70 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..4c3967f91e20 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -933,6 +933,8 @@ static int effective_prio(struct task_struct *p) /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. + * + * Return: 1 if the task is currently executing. 0 otherwise.
*/ inline int task_curr(const struct task_struct *p) { @@ -1482,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * Returns %true if @p was woken up, %false if it was already running + * Return: %true if @p was woken up, %false if it was already running. * or @state didn't match @p's state. */ static int @@ -1577,8 +1579,9 @@ out: * @p: The process to be woken up. * * Attempt to wake up the nominated process and move it to the set of runnable - * processes. Returns 1 if the process was woken up, 0 if it was already - * running. + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. @@ -2191,6 +2194,8 @@ void scheduler_tick(void) * This makes sure that uptime, CFS vruntime, load * balancing, etc... continue to move forward, even * with a very low granularity. + * + * Return: Maximum deferment in nanoseconds. */ u64 scheduler_tick_max_deferment(void) { @@ -2796,8 +2801,8 @@ EXPORT_SYMBOL(wait_for_completion); * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -2829,8 +2834,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. The caller is accounted as waiting for IO. * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. 
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. */ unsigned long __sched wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) @@ -2846,7 +2851,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); * This waits for completion of a specific task to be signaled. It is * interruptible. * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. + * Return: -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { @@ -2865,8 +2870,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, @@ -2883,7 +2888,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. + * Return: -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_killable(struct completion *x) { @@ -2903,8 +2908,8 @@ EXPORT_SYMBOL(wait_for_completion_killable); * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. 
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, @@ -2918,7 +2923,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * - * Returns: 0 if a decrement cannot be done without blocking + * Return: 0 if a decrement cannot be done without blocking * 1 if a decrement succeeded. * * If a completion is being used as a counting completion, @@ -2945,7 +2950,7 @@ EXPORT_SYMBOL(try_wait_for_completion); * completion_done - Test to see if a completion has any waiters * @x: completion structure * - * Returns: 0 if there are waiters (wait_for_completion() in progress) + * Return: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. * */ @@ -3182,7 +3187,7 @@ SYSCALL_DEFINE1(nice, int, increment) * task_prio - return the priority value of a given task. * @p: the task in question. * - * This is the priority value as seen by users in /proc. + * Return: The priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ @@ -3194,6 +3199,8 @@ int task_prio(const struct task_struct *p) /** * task_nice - return the nice value of a given task. * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. */ int task_nice(const struct task_struct *p) { @@ -3204,6 +3211,8 @@ EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. */ int idle_cpu(int cpu) { @@ -3226,6 +3235,8 @@ int idle_cpu(int cpu) /** * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. + * + * Return: The idle task for the cpu @cpu. 
*/ struct task_struct *idle_task(int cpu) { @@ -3235,6 +3246,8 @@ struct task_struct *idle_task(int cpu) /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. + * + * Return: The task of @pid, if found. %NULL otherwise. */ static struct task_struct *find_process_by_pid(pid_t pid) { @@ -3432,6 +3445,8 @@ recheck: * @policy: new policy. * @param: structure containing the new RT priority. * + * Return: 0 on success. An error code otherwise. + * * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, @@ -3451,6 +3466,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * current context has permission. For example, this is needed in * stop_machine(): we create temporary high priority worker threads, * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) @@ -3485,6 +3502,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @pid: the pid in question. * @policy: new policy. * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) @@ -3500,6 +3519,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, * sys_sched_setparam - set/change the RT priority of a thread * @pid: the pid in question. * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { @@ -3509,6 +3530,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) /** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. + * + * Return: On success, the policy of the thread.
Otherwise, a negative error + * code. */ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { @@ -3535,6 +3559,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { @@ -3659,6 +3686,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -3710,6 +3739,8 @@ out_unlock: * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -3744,6 +3775,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * This function yields the current CPU to other tasks. If there are no * other threads running on this CPU then this function will return. + * + * Return: 0. */ SYSCALL_DEFINE0(sched_yield) { @@ -3869,7 +3902,7 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns: + * Return: * true (>0) if we indeed boosted the target task. * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. @@ -3972,8 +4005,9 @@ long __sched io_schedule_timeout(long timeout) * sys_sched_get_priority_max - return maximum RT priority. 
* @policy: scheduling class. * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { @@ -3997,8 +4031,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) * sys_sched_get_priority_min - return minimum RT priority. * @policy: scheduling class. * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { @@ -4024,6 +4059,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) @@ -6632,6 +6670,8 @@ void normalize_rt_tasks(void) * @cpu: the processor in question. * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + * + * Return: The current task for @cpu. */ struct task_struct *curr_task(int cpu) { diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1095e878a46f..8b836b376d91 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -62,7 +62,7 @@ static int convert_prio(int prio) * any discrepancies created by racing against the uncertainty of the current * priority configuration. 
* - * Returns: (int)bool - CPUs were found + * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) @@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * cpupri_init - initialize the cpupri structure * @cp: The cpupri context * - * Returns: -ENOMEM if memory fails. + * Return: -ENOMEM on memory allocation failure. */ int cpupri_init(struct cpupri *cp) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f77f9c527449..98d135584b4b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4280,6 +4280,8 @@ struct sg_lb_stats { * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * + * Return: The load index. */ static inline int get_sd_load_idx(struct sched_domain *sd, enum cpu_idle_type idle) @@ -4574,6 +4576,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, * * Determine if @sg is a busier group than the previously selected * busiest group. + * + * Return: %true if @sg is a busier group than the previously selected + * busiest group. %false otherwise. */ static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, @@ -4691,7 +4696,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, * assuming lower CPU number will be equivalent to lower a SMT thread * number. * - * Returns 1 when packing is required and a task should be moved to + * Return: 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * * @env: The load balancing environment. @@ -4869,7 +4874,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. 
* - * Returns: - the busiest group if imbalance exists. + * Return: - The busiest group if imbalance exists. * - If no imbalance and user has opted for power-savings balance, * return the least loaded group whose CPUs can be * put to idle by rebalancing its tasks onto our group. -- cgit v1.2.3 From 146c3442f2dd0f50d9431aea5d0d10dfd97c9999 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Mon, 15 Jul 2013 16:32:44 +0800 Subject: tracing: Use trace_seq_puts()/trace_seq_putc() where possible For string without format specifiers, use trace_seq_puts() or trace_seq_putc(). Link: http://lkml.kernel.org/r/51E3B3AC.1000605@huawei.com Signed-off-by: zhangwei(Jovi) [ fixed a trace_seq_putc(s, " ") to trace_seq_putc(s, ' ') ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++---- kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_functions_graph.c | 52 ++++++++++++++++++------------------ kernel/trace/trace_mmiotrace.c | 8 +++--- kernel/trace/trace_output.c | 14 +++++----- kernel/trace/trace_syscalls.c | 2 +- 6 files changed, 45 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e444ff88f0a4..eef2e566b2e7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "# compressed entry header\n"); - ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_printf(s, "\tarray : 32 bits\n"); - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_puts(s, "# compressed entry header\n"); + ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_puts(s, "\tarray : 32 bits\n"); + ret = trace_seq_putc(s, '\n'); ret = trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, 
"\ttime_extend : type == %d\n", diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0d883dc057d6..0c7b75a8acc8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -646,7 +646,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_puts(s, "none\n"); mutex_unlock(&event_mutex); } @@ -660,7 +660,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8388bc99f2ee..d56ae9bae00b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* First spaces to align center */ for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* Last spaces to align center */ for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, " ------------------------------------------\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return 
TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " => "); + ret = trace_seq_puts(s, " => "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, "\n ------------------------------------------\n\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return ret; if (type == TRACE_GRAPH_ENT) - ret = trace_seq_printf(s, "==========>"); + ret = trace_seq_puts(s, "==========>"); else - ret = trace_seq_printf(s, "<=========="); + ret = trace_seq_puts(s, "<=========="); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) len += strlen(nsecs_str); } - ret = trace_seq_printf(s, " us "); + ret = trace_seq_puts(s, " us "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Print remaining spaces to fit the row's width */ for (i = len; i < 7; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, /* No real adata, just filling the column with spaces */ switch (duration) { case DURATION_FILL_FULL: - ret = trace_seq_printf(s, " | "); + 
ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; case DURATION_FILL_START: - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; case DURATION_FILL_END: - ret = trace_seq_printf(s, " |"); + ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) - ret = trace_seq_printf(s, "! "); + ret = trace_seq_puts(s, "! "); /* Duration exceeded 10 msecs */ else if (duration > 10000ULL) - ret = trace_seq_printf(s, "+ "); + ret = trace_seq_puts(s, "+ "); } /* @@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, * to fill out the space. */ if (ret == -1) - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); /* Catching here any failure happenned above */ if (!ret) @@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "| "); + ret = trace_seq_puts(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | 
"); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * belongs to, write out the function name. */ if (func_match) { - ret = trace_seq_printf(s, "}\n"); + ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { @@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, /* Indentation */ if (depth > 0) for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* The comment */ - ret = trace_seq_printf(s, "/* "); + ret = trace_seq_puts(s, "/* "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, s->len--; } - ret = trace_seq_printf(s, " */\n"); + ret = trace_seq_puts(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index a5e8f4878bfa..b3dcfb2f0fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) if (drv) ret += trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_printf(s, " \n"); + ret += trace_seq_puts(s, " \n"); return ret; } @@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter) struct header_iter *hiter; struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION 20070824\n"); + trace_seq_puts(s, "VERSION 20070824\n"); hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); if (!hiter) @@ 
-209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; default: - ret = trace_seq_printf(s, "rw what?\n"); + ret = trace_seq_puts(s, "rw what?\n"); break; } if (ret) @@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_printf(s, "map what?\n"); + ret = trace_seq_puts(s, "map what?\n"); break; } if (ret) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bb922d9ee51b..34e7cbac0c9c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_puts(s, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ret) ret = trace_seq_puts(s, "??"); if (ret) - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); } if (mm) @@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) int ret; if (!ip) - return trace_seq_printf(s, "0"); + return trace_seq_putc(s, '0'); if (sym_flags & TRACE_ITER_SYM_OFFSET) ret = seq_print_sym_offset(s, "%s", ip); @@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, goto partial; if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_printf(s, " <-")) + if (!trace_seq_puts(s, " <-")) goto partial; if (!seq_print_ip_sym(s, field->parent_ip, flags)) goto partial; } - if (!trace_seq_printf(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; return TRACE_TYPE_HANDLED; @@ 
-1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, if (!seq_print_ip_sym(s, *p, flags)) goto partial; - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 322e16461072..061156215721 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, entry = syscall_nr_to_meta(syscall); if (!entry) { - trace_seq_printf(s, "\n"); + trace_seq_putc(s, '\n'); return TRACE_TYPE_HANDLED; } -- cgit v1.2.3 From d611851b421731e2afd9cb956daae001af57a423 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Mon, 15 Jul 2013 16:32:50 +0800 Subject: tracing: Typo fix on ring buffer comments There are some mismatches between the comments and the real function names; update them. This patch also adds some missing function argument descriptions. Link: http://lkml.kernel.org/r/51E3B3B2.4080307@huawei.com Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eef2e566b2e7..cc2f66f68dc5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, } /** - * check_pages - integrity check of buffer pages + * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not @@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self, #endif /** - * ring_buffer_alloc - allocate a new ring_buffer + * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. 
* @@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work) * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. + * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * BUF_PAGE_SIZE. * @@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * expected. * * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. + * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * - * This overall must be paired with ring_buffer_finish. + * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) @@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); * an intervening ring_buffer_read_prepare_sync must have been * performed. * - * Must be paired with ring_buffer_finish. + * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) @@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** - * ring_buffer_finish - finish reading the iterator of the buffer + * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables the recording to the buffer, and frees the @@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions @@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * to swap with a page in the ring buffer. 
* * for example: - * rpage = ring_buffer_alloc_read_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); -- cgit v1.2.3 From b8ebfd3f7113b63dda93d76bfec638c00e6bd514 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Jun 2013 19:02:04 +0200 Subject: tracing/function: Avoid perf_trace_buf_*() if event_function.perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit(head, task => NULL) make no sense if hlist_empty(head). Change perf_ftrace_function_call() to check event_function.perf_events beforehand. Link: http://lkml.kernel.org/r/20130617170204.GA19803@redhat.com Acked-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 84b1e045faba..12df5573086e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -266,6 +266,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct pt_regs regs; int rctx; + head = this_cpu_ptr(event_function.perf_events); + if (hlist_empty(head)) + return; + #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) @@ -279,8 +283,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - - head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 1, &regs, head, NULL); -- cgit v1.2.3 From 421c7860c6e1989da3962fafdd6699316c9f8e20 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Jun 2013 19:02:07 +0200 Subject: tracing/syscall: Avoid perf_trace_buf_*() if sys_data->perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit(head, task => NULL) make no sense if hlist_empty(head). 
Change perf_syscall_enter/exit() to check sys_data->{enter,exit}_event->perf_events beforehand. Link: http://lkml.kernel.org/r/20130617170207.GA19806@redhat.com Acked-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 061156215721..ac0085777fbd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -566,6 +566,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* get the size after alignment with the u32 buffer size field */ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); @@ -583,8 +587,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - - head = this_cpu_ptr(sys_data->enter_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } @@ -642,6 +644,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) + return; + /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -661,8 +667,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - - head = this_cpu_ptr(sys_data->exit_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -- cgit v1.2.3 From cd92bf61d6d70bd3eb33b46d600e3f3eb9c5778a Mon Sep 17 00:00:00 2001 
From: Oleg Nesterov Date: Mon, 17 Jun 2013 19:02:11 +0200 Subject: tracing/perf: Move the PERF_MAX_TRACE_SIZE check into perf_trace_buf_prepare() Every perf_trace_buf_prepare() caller does WARN_ONCE(size > PERF_MAX_TRACE_SIZE, message) and "message" is almost the same. Shift this WARN_ONCE() into perf_trace_buf_prepare(). This changes the meaning of _ONCE, but I think this is fine. - 4947014 2932448 10104832 17984294 1126b26 vmlinux + 4948422 2932448 10104832 17985702 11270a6 vmlinux on my build. Link: http://lkml.kernel.org/r/20130617170211.GA19813@redhat.com Acked-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 4 ++++ kernel/trace/trace_kprobe.c | 6 ------ kernel/trace/trace_syscalls.c | 12 ------------ kernel/trace/trace_uprobe.c | 2 -- 4 files changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 12df5573086e..80c36bcf66e8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) + return NULL; + pc = preempt_count(); *rctxp = perf_swevent_get_recursion_context(); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ed6976493c8..ae6ce835b023 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1087,9 +1087,6 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) @@ -1120,9 +1117,6 @@ kretprobe_perf_func(struct 
trace_probe *tp, struct kretprobe_instance *ri, __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ac0085777fbd..8fd03657bc7d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -575,10 +575,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) - return; - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->event.type, regs, &rctx); if (!rec) @@ -652,14 +648,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - /* - * Impossible, but be paranoid with the future - * How to put this check outside runtime? 
- */ - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "exit event has grown above perf buffer size")) - return; - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->event.type, regs, &rctx); if (!rec) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d5d0cd368a56..a23d2d71188e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -818,8 +818,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu, size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; preempt_disable(); head = this_cpu_ptr(call->perf_events); -- cgit v1.2.3 From a232e270dcb55a70ad3241bc6fc160fd9b5c9e6c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 9 Jul 2013 18:35:26 +0900 Subject: tracing/kprobe: Wait for disabling all running kprobe handlers Wait for disabling all running kprobe handlers when a kprobe event is disabled, since the caller, trace_remove_event_call() supposes that a removing event is disabled completely by disabling the event. With this change, ftrace can ensure that there is no running event handlers after disabling it. 
Link: http://lkml.kernel.org/r/20130709093526.20138.93100.stgit@mhiramat-M0-7522 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ae6ce835b023..3811487e7a7a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -243,11 +243,11 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) static int disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { + struct event_file_link *link = NULL; + int wait = 0; int ret = 0; if (file) { - struct event_file_link *link; - link = find_event_file_link(tp, file); if (!link) { ret = -EINVAL; @@ -255,10 +255,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } list_del_rcu(&link->list); - /* synchronize with kprobe_trace_func/kretprobe_trace_func */ - synchronize_sched(); - kfree(link); - + wait = 1; if (!list_empty(&tp->files)) goto out; @@ -271,8 +268,22 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); + wait = 1; } out: + if (wait) { + /* + * Synchronize with kprobe_trace_func/kretprobe_trace_func + * to ensure disabled (all running handlers are finished). + * This is not only for kfree(), but also the caller, + * trace_remove_event_call() supposes it for releasing + * event_call related objects, which will be accessed in + * the kprobe_trace_func/kretprobe_trace_func. 
+ */ + synchronize_sched(); + kfree(link); /* Ignored if link == NULL */ + } + return ret; } -- cgit v1.2.3 From 609e85a70bcd0eedf4ec60639dbcfb1ab011e054 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Wed, 10 Jul 2013 17:34:34 -0700 Subject: tracing: Fix error handling to ensure instances can always be removed Remove debugfs directories for tracing instances during creation if an error occurs causing the trace_array for that instance to not be added to ftrace_trace_arrays. If the directory continues to exist after the error, it cannot be removed because the respective trace_array is not in ftrace_trace_arrays. Link: http://lkml.kernel.org/r/1373502874-1706-2-git-send-email-azl@google.com Cc: stable@vger.kernel.org # 3.10 Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 25b91afc29e0..7c3da7bca05b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5973,8 +5973,10 @@ static int new_instance_create(const char *name) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); - if (ret) + if (ret) { + debugfs_remove_recursive(tr->dir); goto out_free_tr; + } init_tracer_debugfs(tr, tr->dir); -- cgit v1.2.3 From f77d09a384676bde6445413949d9d2c508ff3e62 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Thu, 18 Jul 2013 11:18:44 -0700 Subject: tracing: Miscellaneous fixes for trace_array ref counting Some error paths did not handle ref counting properly, and some trace files need ref counting. 
Link: http://lkml.kernel.org/r/1374171524-11948-1-git-send-email-azl@google.com Cc: stable@vger.kernel.org # 3.10 Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++++++++------ kernel/trace/trace_events.c | 21 +++++++++++++++++++-- 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7c3da7bca05b..7d9ceab42564 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3008,7 +3008,6 @@ static int tracing_release(struct inode *inode, struct file *file) iter = m->private; tr = iter->tr; - trace_array_put(tr); mutex_lock(&trace_types_lock); @@ -3023,6 +3022,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (!iter->snapshot) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); + + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -3447,6 +3449,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; + int ret; if (tracing_disabled) return -ENODEV; @@ -3454,7 +3457,11 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) if (trace_array_get(tr) < 0) return -ENODEV; - return single_open(file, tracing_trace_options_show, inode->i_private); + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } static const struct file_operations tracing_iter_fops = { @@ -3958,6 +3965,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; + __trace_array_put(tr); goto out; } @@ -4704,21 +4712,24 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) ret = 
PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) - return -ENOMEM; + goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); - return -ENOMEM; + goto out; } + ret = 0; + iter->tr = tr; iter->trace_buffer = &tc->tr->max_buffer; iter->cpu_file = tc->cpu; m->private = iter; file->private_data = m; } - +out: if (ret < 0) trace_array_put(tr); @@ -5328,9 +5339,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf, } static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tc, .read = tracing_stats_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tc, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7d854290bf81..7a75cb22eab7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1218,6 +1218,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, @@ -1245,7 +1246,7 @@ static const struct file_operations ftrace_set_event_fops = { .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, - .release = seq_release, + .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { @@ -1323,6 +1324,15 @@ ftrace_event_open(struct inode *inode, struct file *file, return ret; } +static int ftrace_event_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + static int ftrace_event_avail_open(struct inode *inode, struct 
file *file) { @@ -1336,12 +1346,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); - return ftrace_event_open(inode, file, seq_ops); + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; } static struct event_subsystem * -- cgit v1.2.3 From 8f768993394a8c0d3801033c11fd86ce8c88dcac Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 18 Jul 2013 14:41:51 -0400 Subject: tracing: Add ref_data to function and fgraph tracer structs The selftest for function and function graph tracers are defined as __init, as they are only executed at boot up. The "tracer" structs that are associated to those tracers are not setup as __init as they are used after boot. To stop mismatch warnings, those structures need to be annotated with __ref_data. Currently, the tracer structures are defined to __read_mostly, as they do not really change. But in the future they should be converted to consts, but that will take a little work because they have a "next" pointer that gets updated when they are registered. That will have to wait till the next major release. 
Link: http://lkml.kernel.org/r/1373596735.17876.84.camel@gandalf.local.home Reported-by: kbuild test robot Reported-by: Chen Gang Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++++ kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a4f6e1828b6..57b7bb0d39b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data __refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. 
*/ +#define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b863f93b30f3..38fe1483c508 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) return 0; } -static struct tracer function_trace __read_mostly = +static struct tracer function_trace __tracer_data = { .name = "function", .init = function_trace_init, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56ae9bae00b..b5c09242683d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = { .funcs = &graph_functions }; -static struct tracer graph_trace __read_mostly = { +static struct tracer graph_trace __tracer_data = { .name = "function_graph", .open = graph_trace_open, .pipe_open = graph_trace_open, -- cgit v1.2.3 From 7710b639953b791610f0022a7d52d9801c93b969 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 18 Jul 2013 20:47:10 +0200 Subject: tracing: Simplify the iteration logic in f_start/f_next f_next() looks overcomplicated, and it is not strictly correct even if this doesn't matter. Say, FORMAT_FIELD_SEPERATOR should not return NULL (means EOF) if trace_get_fields() returns an empty list, we should simply advance to FORMAT_PRINTFMT as we do when we find the end of list. 1. Change f_next() to return "struct list_head *" rather than "ftrace_event_field *", and change f_show() to do list_entry(). This simplifies the code a bit, only f_show() needs to know about ftrace_event_field, and f_next() can play with ->prev directly 2. Change f_next() to not play with ->prev / return inside the switch() statement. 
It can simply set node = head/common_head, the prev-or-advance-to-the-next-magic below does all work. While at it. f_start() looks overcomplicated too. I don't think *pos == 0 makes sense as a separate case, just change this code to do "while" instead of "do/while". The patch also moves f_start() down, close to f_stop(). This is purely cosmetic, just to make the locking added by the next patch more clear/visible. Link: http://lkml.kernel.org/r/20130718184710.GA4783@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 60 +++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7a75cb22eab7..76defd91f9b4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -826,59 +826,33 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; - struct ftrace_event_field *field; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); + struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - if (unlikely(list_empty(common_head))) - return NULL; - - field = list_entry(common_head->prev, - struct ftrace_event_field, link); - return field; + node = common_head; + break; case FORMAT_FIELD_SEPERATOR: - if (unlikely(list_empty(head))) - return NULL; - - field = list_entry(head->prev, struct ftrace_event_field, link); - return field; + node = head; + break; case FORMAT_PRINTFMT: /* all done */ return NULL; } - field = v; - if (field->link.prev == common_head) + node = node->prev; + if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; - else if (field->link.prev == head) + else if (node == head) return (void *)FORMAT_PRINTFMT; - - field = list_entry(field->link.prev, struct ftrace_event_field, link); - - return field; -} - -static 
void *f_start(struct seq_file *m, loff_t *pos) -{ - loff_t l = 0; - void *p; - - /* Start by showing the header */ - if (!*pos) - return (void *)FORMAT_HEADER; - - p = (void *)FORMAT_HEADER; - do { - p = f_next(m, p, &l); - } while (p && l < *pos); - - return p; + else + return node; } static int f_show(struct seq_file *m, void *v) @@ -904,8 +878,7 @@ static int f_show(struct seq_file *m, void *v) return 0; } - field = v; - + field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: @@ -932,6 +905,17 @@ static int f_show(struct seq_file *m, void *v) return 0; } +static void *f_start(struct seq_file *m, loff_t *pos) +{ + void *p = (void *)FORMAT_HEADER; + loff_t l = 0; + + while (l < *pos && p) + p = f_next(m, p, &l); + + return p; +} + static void f_stop(struct seq_file *m, void *p) { } -- cgit v1.2.3 From cd458ba9d5a5592d37b5145e560071e91ea762ac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 18 Jul 2013 20:47:12 +0200 Subject: tracing: Do not (ab)use trace_seq in event_id_read() event_id_read() has no reason to kmalloc "struct trace_seq" (more than PAGE_SIZE!), it can use a small buffer instead. Note: "if (*ppos) return 0" looks strange and even wrong, simple_read_from_buffer() handles the ppos != 0 case correctly. And it seems that almost every user of trace_seq in this file should be converted too. Unless you use seq_open(), trace_seq buys nothing compared to the raw buffer, but it needs a bit more memory and code.
Link: http://lkml.kernel.org/r/20130718184712.GA4786@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 76defd91f9b4..898f868833f2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -947,23 +947,14 @@ static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - int r; + char buf[32]; + int len; if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->event.type); - - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - kfree(s); - return r; + len = sprintf(buf, "%d\n", call->event.type); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } static ssize_t -- cgit v1.2.3 From a644a7e9587802eabb2e229177606f6a74a60fc1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jul 2013 16:20:36 +0200 Subject: tracing: Kill trace_array->waiter Trivial. trace_array->waiter has no users since 6eaaa5d5 "tracing/core: use appropriate waiting on trace_pipe". 
Link: http://lkml.kernel.org/r/20130719142036.GA1594@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57b7bb0d39b7..e7d643b8a907 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -214,7 +214,6 @@ struct trace_array { struct dentry *event_dir; struct list_head systems; struct list_head events; - struct task_struct *waiter; int ref; }; -- cgit v1.2.3 From e70e78e3c83b536730e31231dd9b979768d8df3c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 19 Jul 2013 17:36:44 +0200 Subject: tracing: Kill the unbalanced tr->ref++ in tracing_buffers_open() tracing_buffers_open() does trace_array_get() and then it wrongly increments tr->ref again under trace_types_lock. This means that every caller leaks trace_array: # cd /sys/kernel/debug/tracing/ # mkdir instances/X # true < instances/X/per_cpu/cpu0/trace_pipe_raw # rmdir instances/X rmdir: failed to remove `instances/X': Device or resource busy Link: http://lkml.kernel.org/r/20130719153644.GA18899@redhat.com Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Masami Hiramatsu Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d9ceab42564..3f2477713aca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4959,8 +4959,6 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - tr->ref++; - info->iter.tr = tr; info->iter.cpu_file = tc->cpu; info->iter.trace = tr->current_trace; -- cgit v1.2.3 From 42577ca8c3616baaafdd8f167b2e1fb959026081 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 23 Jul 2013 16:49:24 +0100 Subject: Fix __wait_on_atomic_t() to call the action func if the counter != 0 Fix
__wait_on_atomic_t() so that it calls the action func if the counter != 0 rather than if the counter is 0 so as to be analogous to __wait_on_bit(). Thanks to Yacine who found this by visual inspection. This will affect FS-Cache in that it could fail to sleep correctly when trying to clean up after a netfs cookie is withdrawn. Reported-by: Yacine Belkadi Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Milosz Tanski Signed-off-by: Linus Torvalds --- kernel/wait.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/wait.c b/kernel/wait.c index ce0daa320a26..dec68bd4e9d8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -333,7 +333,8 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait(wq, &q->wait, mode); val = q->key.flags; if (atomic_read(val) == 0) - ret = (*action)(val); + break; + ret = (*action)(val); } while (!ret && atomic_read(val) != 0); finish_wait(wq, &q->wait); return ret; -- cgit v1.2.3 From 649e9c70da6bfbeb563193a35d3424a5aa7c0d38 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:25:54 +0200 Subject: tracing: Introduce trace_create_cpu_file() and tracing_get_cpu() Every "file_operations" used by tracing_init_debugfs_percpu is buggy. f_op->open/etc does: 1. struct trace_cpu *tc = inode->i_private; struct trace_array *tr = tc->tr; 2. trace_array_get(tr) or fail; 3. do_something(tc); But tc (and tr) can be already freed before trace_array_get() is called. And it doesn't matter whether this file is per-cpu or it was created by init_tracer_debugfs(), free_percpu() or kfree() are equally bad. Note that even 1. is not safe, the freed memory can be unmapped. But even if it was safe trace_array_get() can wrongly succeed if we also race with the next new_instance_create() which can re-allocate the same tr, or tc was overwritten and ->tr points to the valid tr. In this case 3. uses the freed/reused memory.
Add the new trivial helper, trace_create_cpu_file() which simply calls trace_create_file() and encodes "cpu" in "struct inode". Another helper, tracing_get_cpu() will be used to read cpu_nr-or-RING_BUFFER_ALL_CPUS. The patch abuses ->i_cdev to encode the number, it is never used unless the file is S_ISCHR(). But we could use something else, say, i_bytes or even ->d_fsdata. In any case this hack is hidden inside these 2 helpers, it would be trivial to change them if needed. This patch only changes tracing_init_debugfs_percpu() to use the new trace_create_cpu_file(), the next patches will change file_operations. Note: tracing_get_cpu(inode) is always safe but you can't trust the result unless trace_array_get() was called, without trace_types_lock which acts as a barrier it can wrongly return RING_BUFFER_ALL_CPUS. Link: http://lkml.kernel.org/r/20130723152554.GA23710@redhat.com Cc: Al Viro Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3f2477713aca..cfff63c2148a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2843,6 +2843,17 @@ static int s_show(struct seq_file *m, void *v) return 0; } +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. 
+ */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} + static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -5529,6 +5540,17 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, + void *data, long cpu, const struct file_operations *fops) +{ + struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + + if (ret) /* See tracing_get_cpu() */ + ret->d_inode->i_cdev = (void *)(cpu + 1); + return ret; +} + static void tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { @@ -5548,28 +5570,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) } /* per cpu trace_pipe */ - trace_create_file("trace_pipe", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_pipe_fops); + trace_create_cpu_file("trace_pipe", 0444, d_cpu, + &data->trace_cpu, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_file("trace", 0644, d_cpu, - (void *)&data->trace_cpu, &tracing_fops); + trace_create_cpu_file("trace", 0644, d_cpu, + &data->trace_cpu, cpu, &tracing_fops); - trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_buffers_fops); + trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + &data->trace_cpu, cpu, &tracing_buffers_fops); - trace_create_file("stats", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_stats_fops); + trace_create_cpu_file("stats", 0444, d_cpu, + &data->trace_cpu, cpu, &tracing_stats_fops); - trace_create_file("buffer_size_kb", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_entries_fops); + trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + &data->trace_cpu, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_cpu, - 
(void *)&data->trace_cpu, &snapshot_fops); + trace_create_cpu_file("snapshot", 0644, d_cpu, + &data->trace_cpu, cpu, &snapshot_fops); - trace_create_file("snapshot_raw", 0444, d_cpu, - (void *)&data->trace_cpu, &snapshot_raw_fops); + trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + &data->trace_cpu, cpu, &snapshot_raw_fops); #endif } -- cgit v1.2.3 From 15544209cb0b5312e5220a9337a1fe61d1a1f2d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:25:57 +0200 Subject: tracing: Change tracing_pipe_fops() to rely on tracing_get_cpu() tracing_open_pipe() is racy, the memory inode->i_private points to can be already freed. Change debugfs_create_file("trace_pipe", data) callers to pass "data = tr", tracing_open_pipe() can use tracing_get_cpu(). Link: http://lkml.kernel.org/r/20130723152557.GA23717@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cfff63c2148a..51a99ef2a6e5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3959,8 +3959,7 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, static int tracing_open_pipe(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret = 0; @@ -4006,9 +4005,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - iter->cpu_file = tc->cpu; - iter->tr = tc->tr; - iter->trace_buffer = &tc->tr->trace_buffer; + iter->tr = tr; + iter->trace_buffer = &tr->trace_buffer; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); filp->private_data = iter; @@ -4031,8 +4030,7 @@ fail: static int tracing_release_pipe(struct inode *inode, struct
file *file) { struct trace_iterator *iter = file->private_data; - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; mutex_lock(&trace_types_lock); @@ -5571,7 +5569,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) /* per cpu trace_pipe */ trace_create_cpu_file("trace_pipe", 0444, d_cpu, - &data->trace_cpu, cpu, &tracing_pipe_fops); + tr, cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_cpu_file("trace", 0644, d_cpu, @@ -6157,7 +6155,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) (void *)&tr->trace_cpu, &tracing_fops); trace_create_file("trace_pipe", 0444, d_tracer, - (void *)&tr->trace_cpu, &tracing_pipe_fops); + tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, (void *)&tr->trace_cpu, &tracing_entries_fops); -- cgit v1.2.3 From 46ef2be0d1d5ccea0c41bb606143586daadd537c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:26:00 +0200 Subject: tracing: Change tracing_buffers_fops to rely on tracing_get_cpu() tracing_buffers_open() is racy, the memory inode->i_private points to can be already freed. Change debugfs_create_file("trace_pipe_raw", data) caller to pass "data = tr", tracing_buffers_open() can use tracing_get_cpu(). Change debugfs_create_file("snapshot_raw_fops", data) caller too, this file uses tracing_buffers_open/release. 
Link: http://lkml.kernel.org/r/20130723152600.GA23720@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 51a99ef2a6e5..30c058a56ffb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4949,8 +4949,7 @@ static const struct file_operations snapshot_raw_fops = { static int tracing_buffers_open(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; int ret; @@ -4969,7 +4968,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); info->iter.tr = tr; - info->iter.cpu_file = tc->cpu; + info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; info->iter.trace_buffer = &tr->trace_buffer; info->spare = NULL; @@ -5576,7 +5575,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) &data->trace_cpu, cpu, &tracing_fops); trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, - &data->trace_cpu, cpu, &tracing_buffers_fops); + tr, cpu, &tracing_buffers_fops); trace_create_cpu_file("stats", 0444, d_cpu, &data->trace_cpu, cpu, &tracing_stats_fops); @@ -5589,7 +5588,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) &data->trace_cpu, cpu, &snapshot_fops); trace_create_cpu_file("snapshot_raw", 0444, d_cpu, - &data->trace_cpu, cpu, &snapshot_raw_fops); + tr, cpu, &snapshot_raw_fops); #endif } -- cgit v1.2.3 From 4d3435b8a4c3357695e09c5e7a3bf73a19fca5b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:26:03 +0200 Subject: tracing: Change tracing_stats_fops to rely on tracing_get_cpu() tracing_open_generic_tc() is racy, the memory inode->i_private points to can be already freed. 1. 
Change one of its users, tracing_stats_fops, to use tracing_*_generic_tr() instead. 2. Change trace_create_cpu_file("stats", data) to pass "data = tr". 3. Change tracing_stats_read() to use tracing_get_cpu(). Link: http://lkml.kernel.org/r/20130723152603.GA23727@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 30c058a56ffb..e29dc8f69aac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2982,7 +2982,6 @@ static int tracing_open_generic_tr(struct inode *inode, struct file *filp) filp->private_data = inode->i_private; return 0; - } static int tracing_open_generic_tc(struct inode *inode, struct file *filp) @@ -5285,14 +5284,14 @@ static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; - struct trace_array *tr = tc->tr; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; struct trace_buffer *trace_buf = &tr->trace_buffer; + int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; - int cpu = tc->cpu; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -5345,10 +5344,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf, } static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic_tc, + .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, - .release = tracing_release_generic_tc, + .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -5578,7 +5577,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) tr, cpu, &tracing_buffers_fops); trace_create_cpu_file("stats", 0444, d_cpu, - &data->trace_cpu, cpu, &tracing_stats_fops); + tr, cpu, &tracing_stats_fops); 
trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, &data->trace_cpu, cpu, &tracing_entries_fops); -- cgit v1.2.3 From 0bc392ee46d0fd8e6b678457ef71f074f19a03c5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:26:06 +0200 Subject: tracing: Change tracing_entries_fops to rely on tracing_get_cpu() tracing_open_generic_tc() is racy, the memory inode->i_private points to can be already freed. 1. Change its last user, tracing_entries_fops, to use tracing_*_generic_tr() instead. 2. Change debugfs_create_file("buffer_size_kb", data) callers to pass "data = tr". 3. Change tracing_entries_read() and tracing_entries_write() to use tracing_get_cpu(). 4. Kill the no longer used tracing_open_generic_tc() and tracing_release_generic_tc(). Link: http://lkml.kernel.org/r/20130723152606.GA23730@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 49 ++++++++++++------------------------------------- 1 file changed, 12 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e29dc8f69aac..68b46851666f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2984,23 +2984,6 @@ static int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } -static int tracing_open_generic_tc(struct inode *inode, struct file *filp) -{ - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; - - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; - - filp->private_data = inode->i_private; - - return 0; - -} - static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; @@ -3054,15 +3037,6 @@ static int tracing_release_generic_tr(struct inode *inode, struct file *file) return 0; } -static int tracing_release_generic_tc(struct inode *inode, struct file *file) -{ - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; - - 
trace_array_put(tr); - return 0; -} - static int tracing_single_release_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -4382,15 +4356,16 @@ static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; - struct trace_array *tr = tc->tr; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); - if (tc->cpu == RING_BUFFER_ALL_CPUS) { + if (cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; @@ -4417,7 +4392,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -4429,7 +4404,8 @@ static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; unsigned long val; int ret; @@ -4443,8 +4419,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - - ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); + ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; @@ -4892,11 +4867,11 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic_tc, + .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, - .release = tracing_release_generic_tc, + .release = tracing_release_generic_tr, }; static const struct 
file_operations tracing_total_entries_fops = { @@ -5580,7 +5555,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) tr, cpu, &tracing_stats_fops); trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, - &data->trace_cpu, cpu, &tracing_entries_fops); + tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT trace_create_cpu_file("snapshot", 0644, d_cpu, @@ -6156,7 +6131,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, - (void *)&tr->trace_cpu, &tracing_entries_fops); + tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); -- cgit v1.2.3 From 6484c71cbc170634fa131b6d022d86d61686b88b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:26:10 +0200 Subject: tracing: Change tracing_fops/snapshot_fops to rely on tracing_get_cpu() tracing_open() and tracing_snapshot_open() are racy, the memory inode->i_private points to can be already freed. Convert these last users of "inode->i_private == trace_cpu" to use "i_private = trace_array" and rely on tracing_get_cpu(). v2: incorporate the fix from Steven, tracing_release() must not blindly dereference file->private_data unless we know that the file was opened for reading. 
Link: http://lkml.kernel.org/r/20130723152610.GA23737@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68b46851666f..dd7780ddde08 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2862,9 +2862,9 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct trace_array *tr, struct trace_cpu *tc, - struct inode *inode, struct file *file, bool snapshot) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; @@ -2905,8 +2905,8 @@ __tracing_open(struct trace_array *tr, struct trace_cpu *tc, iter->trace_buffer = &tr->trace_buffer; iter->snapshot = snapshot; iter->pos = -1; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); - iter->cpu_file = tc->cpu; /* Notify the tracer early; before we stop tracing. 
*/ if (iter->trace && iter->trace->open) @@ -2986,22 +2986,18 @@ static int tracing_open_generic_tr(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; - struct trace_array *tr; int cpu; - /* Writes do not use seq_file, need to grab tr from inode */ if (!(file->f_mode & FMODE_READ)) { - struct trace_cpu *tc = inode->i_private; - - trace_array_put(tc->tr); + trace_array_put(tr); return 0; } + /* Writes do not use seq_file */ iter = m->private; - tr = iter->tr; - mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { @@ -3048,8 +3044,7 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file) static int tracing_open(struct inode *inode, struct file *file) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret = 0; @@ -3057,16 +3052,17 @@ static int tracing_open(struct inode *inode, struct file *file) return -ENODEV; /* If this file was open for write, then erase contents */ - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - if (tc->cpu == RING_BUFFER_ALL_CPUS) + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + int cpu = tracing_get_cpu(inode); + + if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(&tr->trace_buffer, tc->cpu); + tracing_reset(&tr->trace_buffer, cpu); } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(tr, tc, inode, file, false); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) @@ -4680,8 +4676,7 @@ struct ftrace_buffer_info { #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { - struct trace_cpu *tc = inode->i_private; - 
struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; int ret = 0; @@ -4690,7 +4685,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) return -ENODEV; if (file->f_mode & FMODE_READ) { - iter = __tracing_open(tr, tc, inode, file, true); + iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { @@ -4707,8 +4702,8 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) ret = 0; iter->tr = tr; - iter->trace_buffer = &tc->tr->max_buffer; - iter->cpu_file = tc->cpu; + iter->trace_buffer = &tr->max_buffer; + iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; } @@ -5525,7 +5520,6 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, static void tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5546,7 +5540,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) /* per cpu trace */ trace_create_cpu_file("trace", 0644, d_cpu, - &data->trace_cpu, cpu, &tracing_fops); + tr, cpu, &tracing_fops); trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, tr, cpu, &tracing_buffers_fops); @@ -5559,7 +5553,7 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) #ifdef CONFIG_TRACER_SNAPSHOT trace_create_cpu_file("snapshot", 0644, d_cpu, - &data->trace_cpu, cpu, &snapshot_fops); + tr, cpu, &snapshot_fops); trace_create_cpu_file("snapshot_raw", 0444, d_cpu, tr, cpu, &snapshot_raw_fops); @@ -6125,7 +6119,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) tr, &tracing_iter_fops); trace_create_file("trace", 0644, d_tracer, - (void *)&tr->trace_cpu, &tracing_fops); + tr, &tracing_fops); 
trace_create_file("trace_pipe", 0444, d_tracer, tr, &tracing_pipe_fops); @@ -6146,11 +6140,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) &trace_clock_fops); trace_create_file("tracing_on", 0644, d_tracer, - tr, &rb_simple_fops); + tr, &rb_simple_fops); #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, - (void *)&tr->trace_cpu, &snapshot_fops); + tr, &snapshot_fops); #endif for_each_tracing_cpu(cpu) -- cgit v1.2.3 From 9c01fe4593db123c5a72dc36f0400f776e92c954 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jul 2013 17:26:13 +0200 Subject: tracing: Kill trace_cpu struct/members After the previous changes trace_array_cpu->trace_cpu and trace_array->trace_cpu becomes write-only. Remove these members and kill "struct trace_cpu" as well. As a side effect this also removes memset(per_cpu_memory, 0). It was not needed, alloc_percpu() returns zero-filled memory. Link: http://lkml.kernel.org/r/20130723152613.GA23741@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 21 --------------------- kernel/trace/trace.h | 8 -------- 2 files changed, 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd7780ddde08..69cba470ea96 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5865,17 +5865,6 @@ struct dentry *trace_instance_dir; static void init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); -static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) -{ - int cpu; - - for_each_tracing_cpu(cpu) { - memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; - per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; - } -} - static int allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) { @@ -5893,8 +5882,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size 
return -ENOMEM; } - init_trace_buffers(tr, buf); - /* Allocate the first page for all buffers */ set_buffer_entries(&tr->trace_buffer, ring_buffer_size(tr->trace_buffer.buffer, 0)); @@ -5961,10 +5948,6 @@ static int new_instance_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - /* Holder for file callbacks */ - tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; - tr->trace_cpu.tr = tr; - tr->dir = debugfs_create_dir(name, trace_instance_dir); if (!tr->dir) goto out_free_tr; @@ -6438,10 +6421,6 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; - /* Holder for file callbacks */ - global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; - global_trace.trace_cpu.tr = &global_trace; - INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); list_add(&global_trace.list, &ftrace_trace_arrays); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e7d643b8a907..afaae41b0a02 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -130,19 +130,12 @@ enum trace_flag_type { struct trace_array; -struct trace_cpu { - struct trace_array *tr; - struct dentry *dir; - int cpu; -}; - /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) 
*/ struct trace_array_cpu { - struct trace_cpu trace_cpu; atomic_t disabled; void *buffer_page; /* ring buffer spare */ @@ -196,7 +189,6 @@ struct trace_array { bool allocated_snapshot; #endif int buffer_disabled; - struct trace_cpu trace_cpu; /* place holder */ #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; -- cgit v1.2.3 From 195a8afc7ac962f8da795549fe38e825f1372b0d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 23 Jul 2013 22:06:15 -0400 Subject: ftrace: Add check for NULL regs if ops has SAVE_REGS set If a ftrace ops is registered with the SAVE_REGS flag set, and there's already a ops registered to one of its functions but without the SAVE_REGS flag, there's a small race window where the SAVE_REGS ops gets added to the list of callbacks to call for that function before the callback trampoline gets set to save the regs. The problem is, the function is not currently saving regs, which opens a small race window where the ops that is expecting regs to be passed to it, wont. This can cause a crash if the callback were to reference the regs, as the SAVE_REGS guarantees that regs will be set. To fix this, we add a check in the loop case where it checks if the ops has the SAVE_REGS flag set, and if so, it will ignore it if regs is not set. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 67708f46baae..8ce9eefc5bb4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1441,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, * the hashes are freed with call_rcu_sched(). 
*/ static int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { struct ftrace_hash *filter_hash; struct ftrace_hash *notrace_hash; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * There's a small race when adding ops that the ftrace handler + * that wants regs, may be called without them. We can not + * allow that handler to be called if regs is NULL. + */ + if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) + return 0; +#endif + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); @@ -4218,7 +4228,7 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { return 1; } @@ -4241,7 +4251,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, do_for_each_ftrace_op(op, ftrace_control_list) { if (!(op->flags & FTRACE_OPS_FL_STUB) && !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip)) + ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); @@ -4274,7 +4284,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip)) + if (ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); -- cgit v1.2.3 From c2fda509667b0fda4372a237f5a59ea4570b1627 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Jul 2013 18:31:42 +0800 Subject: workqueue: allow work_on_cpu() to be called recursively If the @fn call work_on_cpu() again, the lockdep will complain: > [ INFO: possible recursive locking detected ] > 
3.11.0-rc1-lockdep-fix-a #6 Not tainted > --------------------------------------------- > kworker/0:1/142 is trying to acquire lock: > ((&wfc.work)){+.+.+.}, at: [] flush_work+0x0/0xb0 > > but task is already holding lock: > ((&wfc.work)){+.+.+.}, at: [] process_one_work+0x169/0x610 > > other info that might help us debug this: > Possible unsafe locking scenario: > > CPU0 > ---- > lock((&wfc.work)); > lock((&wfc.work)); > > *** DEADLOCK *** It is a false-positive lockdep report. In this situation, the two "wfc"s of the two work_on_cpu() calls are different; they are both on the stack. flush_work() can't deadlock. To fix this, we need to avoid the lockdep checking in this case, thus we introduce an internal __flush_work() which skips the lockdep. tj: Minor comment adjustment. Signed-off-by: Lai Jiangshan Reported-by: "Srivatsa S. Bhat" Reported-by: Alexander Duyck Signed-off-by: Tejun Heo --- kernel/workqueue.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4a0c3c..55f5f0afcd0d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2817,6 +2817,19 @@ already_gone: return false; } +static bool __flush_work(struct work_struct *work) +{ + struct wq_barrier barr; + + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } +} + /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2830,18 +2843,10 @@ already_gone: */ bool flush_work(struct work_struct *work) { - struct wq_barrier barr; - lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } + return __flush_work(work); } EXPORT_SYMBOL_GPL(flush_work); @@ -4756,7
+4761,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); + + /* + * The work item is on-stack and can't lead to deadlock through + * flushing. Use __flush_work() to avoid spurious lockdep warnings + * when work_on_cpu()s are nested. + */ + __flush_work(&wfc.work); + return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.2.3 From 09d8091c024ec88d1541d93eb8ddb2bd5cf10c39 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 23 Jul 2013 22:21:59 -0400 Subject: tracing: Remove locking trace_types_lock from tracing_reset_all_online_cpus() Commit a82274151af "tracing: Protect ftrace_trace_arrays list in trace_events.c" added taking the trace_types_lock mutex in trace_events.c as there were several locations that needed it for protection. Unfortunately, it also encapsulated a call to tracing_reset_all_online_cpus() which also takes the trace_types_lock, causing a deadlock. This happens when a module has tracepoints and has been traced. When the module is removed, the trace events module notifier will grab the trace_types_lock, do a bunch of clean ups, and also clears the buffer by calling tracing_reset_all_online_cpus. This doesn't happen often which explains why it wasn't caught right away. Commit a82274151af was marked for stable, which means this must be sent to stable too. 
Link: http://lkml.kernel.org/r/51EEC646.7070306@broadcom.com Reported-by: Arend van Spril Tested-by: Arend van Spriel Cc: Alexander Z Lam Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 69cba470ea96..882ec1dd1515 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1224,18 +1224,17 @@ void tracing_reset_current(int cpu) tracing_reset(&global_trace.trace_buffer, cpu); } +/* Must have trace_types_lock held */ void tracing_reset_all_online_cpus(void) { struct trace_array *tr; - mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } - mutex_unlock(&trace_types_lock); } #define SAVED_CMDLINES 128 -- cgit v1.2.3 From d738ce8fdc05ebf5b1475f8ae26d908c8c50970b Mon Sep 17 00:00:00 2001 From: Francesco Fusco Date: Wed, 24 Jul 2013 10:39:07 +0200 Subject: sysctl: range checking in do_proc_dointvec_ms_jiffies_conv When (integer) sysctl values are expressed in ms and have to be represented internally as jiffies. The msecs_to_jiffies function returns an unsigned long, which gets assigned to the integer. This patch prevents the value to be assigned if bigger than INT_MAX, done in a similar way as in cba9f3 ("Range checking in do_proc_dointvec_(userhz_)jiffies_conv"). Signed-off-by: Francesco Fusco CC: Andrew Morton CC: linux-kernel@vger.kernel.org Signed-off-by: David S. 
Miller --- kernel/sysctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac09d98490aa..07f6fc468e17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2346,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + + if (jif > INT_MAX) + return 1; + *valp = (int)jif; } else { int val = *valp; unsigned long lval; -- cgit v1.2.3 From 148519120c6d1f19ad53349683aeae9f228b0b8d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 27 Jul 2013 01:41:34 +0200 Subject: Revert "cpuidle: Quickly notice prediction failure for repeat mode" Revert commit 69a37bea (cpuidle: Quickly notice prediction failure for repeat mode), because it has been identified as the source of a significant performance regression in v3.8 and later as explained by Jeremy Eder: We believe we've identified a particular commit to the cpuidle code that seems to be impacting performance of variety of workloads. The simplest way to reproduce is using netperf TCP_RR test, so we're using that, on a pair of Sandy Bridge based servers. We also have data from a large database setup where performance is also measurably/positively impacted, though that test data isn't easily share-able. Included below are test results from 3 test kernels: kernel reverts ----------------------------------------------------------- 1) vanilla upstream (no reverts) 2) perfteam2 reverts e11538d1f03914eb92af5a1a378375c05ae8520c 3) test reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 e11538d1f03914eb92af5a1a378375c05ae8520c In summary, netperf TCP_RR numbers improve by approximately 4% after reverting 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. When 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 is included, C0 residency never seems to get above 40%. 
Taking that patch out gets C0 near 100% quite often, and performance increases. The below data are histograms representing the %c0 residency @ 1-second sample rates (using turbostat), while under netperf test. - If you look at the first 4 histograms, you can see %c0 residency almost entirely in the 30,40% bin. - The last pair, which reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4, shows %c0 in the 80,90,100% bins. Below each kernel name are netperf TCP_RR trans/s numbers for the particular kernel that can be disclosed publicly, comparing the 3 test kernels. We ran a 4th test with the vanilla kernel where we've also set /dev/cpu_dma_latency=0 to show overall impact boosting single-threaded TCP_RR performance over 11% above baseline. 3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0): TCP_RR trans/s 54323.78 ----------------------------------------------------------- 3.10-rc2 vanilla RX (no reverts) TCP_RR trans/s 48192.47 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 1]: * 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 49]: ************************************************* 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 perfteam2 RX (reverts commit e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 49698.69 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 
80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 2]: ** 40.0000 - 50.0000 [ 58]: ********************************************************** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 test RX (reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 and e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 47766.95 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 27]: *************************** 40.0000 - 50.0000 [ 2]: ** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 2]: ** 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 28]: **************************** Sender: 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 1]: * 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 3]: *** 80.0000 - 90.0000 [ 7]: ******* 90.0000 - 100.0000 [ 38]: ************************************** These results demonstrate gaining back the tendency of the CPU to stay in more responsive, performant C-states (and thus yield measurably better performance), by reverting commit 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. Requested-by: Jeremy Eder Tested-by: Len Brown Cc: 3.8+ Signed-off-by: Rafael J. 
Wysocki --- kernel/time/tick-sched.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e80183f4a6c4..e77edc97e036 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -827,13 +827,10 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - if (ts->inidle) { - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); + if (ts->inidle) __tick_nohz_idle_enter(ts); - } else { + else tick_nohz_full_stop_tick(ts); - } } /** @@ -931,8 +928,6 @@ void tick_nohz_idle_exit(void) ts->inidle = 0; - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); if (ts->idle_active || ts->tick_stopped) now = ktime_get(); -- cgit v1.2.3 From 1a11126bcb7c93c289bf3218fa546fd3b0c0df8b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Jul 2013 19:25:32 +0200 Subject: tracing: Turn event/id->i_private into call->event.type event_id_read() is racy, ftrace_event_call can be already freed by trace_remove_event_call() callers. Change event_create_dir() to pass "data = call->event.type", this is all event_id_read() needs. ftrace_event_id_fops no longer needs tracing_open_generic(). We add the new helper, event_file_data(), to read ->i_private, it will have more users. Note: currently ACCESS_ONCE() and "id != 0" check are not needed, but we are going to change event_remove/rmdir to clear ->i_private. 
Link: http://lkml.kernel.org/r/20130726172532.GA3605@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 898f868833f2..c2d13c528c3c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -409,6 +409,11 @@ static void put_system(struct ftrace_subsystem_dir *dir) mutex_unlock(&event_mutex); } +static void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} + /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. @@ -946,14 +951,18 @@ static int trace_format_open(struct inode *inode, struct file *file) static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + int id = (long)event_file_data(filp); char buf[32]; int len; if (*ppos) return 0; - len = sprintf(buf, "%d\n", call->event.type); + if (unlikely(!id)) + return -ENODEV; + + len = sprintf(buf, "%d\n", id); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } @@ -1240,7 +1249,6 @@ static const struct file_operations ftrace_event_format_fops = { }; static const struct file_operations ftrace_event_id_fops = { - .open = tracing_open_generic, .read = event_id_read, .llseek = default_llseek, }; @@ -1488,8 +1496,8 @@ event_create_dir(struct dentry *parent, #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, file->dir, call, - id); + trace_create_file("id", 0444, file->dir, + (void *)(long)call->event.type, id); #endif /* -- cgit v1.2.3 From bc6f6b08dee5645770efb4b76186ded313f23752 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Jul 2013 19:25:36 +0200 Subject: tracing: Change event_enable/disable_read() to verify 
i_private != NULL tracing_open_generic_file() is racy, ftrace_event_file can be already freed by rmdir or trace_remove_event_call(). Change event_enable_read() and event_disable_read() to read and verify "file = i_private" under event_mutex. This fixes nothing, but now we can change debugfs_remove("enable") callers to nullify ->i_private and fix the problem. Link: http://lkml.kernel.org/r/20130726172536.GA3612@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c2d13c528c3c..3dfa8419d0dc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -684,15 +684,25 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file = filp->private_data; + struct ftrace_event_file *file; + unsigned long flags; char buf[4] = "0"; - if (file->flags & FTRACE_EVENT_FL_ENABLED && - !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file)) + flags = file->flags; + mutex_unlock(&event_mutex); + + if (!file) + return -ENODEV; + + if (flags & FTRACE_EVENT_FL_ENABLED && + !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED || - file->flags & FTRACE_EVENT_FL_SOFT_MODE) + if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || + flags & FTRACE_EVENT_FL_SOFT_MODE) strcat(buf, "*"); strcat(buf, "\n"); @@ -704,13 +714,10 @@ static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file = filp->private_data; + struct ftrace_event_file *file; unsigned long val; int ret; - if (!file) - return -EINVAL; - ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret) return ret; @@ -722,8 +729,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: case 1: + ret = -ENODEV; mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(file, val); + file = event_file_data(filp); + if (likely(file)) + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; -- cgit v1.2.3 From e2912b091c26b8ea95e5e00a43a7ac620f6c94a6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Jul 2013 19:25:40 +0200 Subject: tracing: Change event_filter_read/write to verify i_private != NULL event_filter_read/write() are racy, ftrace_event_call can be already freed by trace_remove_event_call() callers. 1. Shift mutex_lock(event_mutex) from print/apply_event_filter to the callers. 2. Change the callers, event_filter_read() and event_filter_write() to read i_private under this mutex and abort if it is NULL. This fixes nothing, but now we can change debugfs_remove("filter") callers to nullify ->i_private and fix the problem.
Link: http://lkml.kernel.org/r/20130726172540.GA3619@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 26 +++++++++++++++++++------- kernel/trace/trace_events_filter.c | 17 ++++++----------- 2 files changed, 25 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3dfa8419d0dc..1d7b6d03cd51 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -980,21 +980,28 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_call *call; struct trace_seq *s; - int r; + int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) return -ENOMEM; trace_seq_init(s); - print_event_filter(call, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + mutex_lock(&event_mutex); + call = event_file_data(filp); + if (call) + print_event_filter(call, s); + mutex_unlock(&event_mutex); + + if (call) + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -1005,9 +1012,9 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_call *call; char *buf; - int err; + int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; @@ -1022,7 +1029,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_event_filter(call, buf); + mutex_lock(&event_mutex); + call = event_file_data(filp); + if (call) + err = apply_event_filter(call, buf); + mutex_unlock(&event_mutex); + free_page((unsigned long) buf); if (err < 0) return err; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0c7b75a8acc8..97daa8cf958d 100644 --- 
a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } +/* caller must hold event_mutex */ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter; + struct event_filter *filter = call->filter; - mutex_lock(&event_mutex); - filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_puts(s, "none\n"); - mutex_unlock(&event_mutex); } void print_subsystem_event_filter(struct event_subsystem *system, @@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system, return err; } +/* caller must hold event_mutex */ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { struct event_filter *filter; - int err = 0; - - mutex_lock(&event_mutex); + int err; if (!strcmp(strstrip(filter_string), "0")) { filter_disable(call); filter = call->filter; if (!filter) - goto out_unlock; + return 0; RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); - goto out_unlock; + return 0; } err = create_filter(call, filter_string, true, &filter); @@ -1884,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) __free_filter(tmp); } } -out_unlock: - mutex_unlock(&event_mutex); return err; } -- cgit v1.2.3 From c5a44a1200c6eda2202434f25325e8ad19533fca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Jul 2013 19:25:43 +0200 Subject: tracing: Change f_start() to take event_mutex and verify i_private != NULL trace_format_open() and trace_format_seq_ops are racy, nothing protects ftrace_event_call from trace_remove_event_call(). Change f_start() to take event_mutex and verify i_private != NULL, change f_stop() to drop this lock. 
This fixes nothing, but now we can change debugfs_remove("format") callers to nullify ->i_private and fix the problem. Note: the usage of event_mutex is sub-optimal but simple, we can change this later. Link: http://lkml.kernel.org/r/20130726172543.GA3622@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d7b6d03cd51..50dc8b2e5435 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -840,7 +840,7 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; + struct ftrace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; @@ -872,7 +872,7 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = m->private; + struct ftrace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; @@ -925,6 +925,11 @@ static void *f_start(struct seq_file *m, loff_t *pos) void *p = (void *)FORMAT_HEADER; loff_t l = 0; + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + if (!event_file_data(m->private)) + return ERR_PTR(-ENODEV); + while (l < *pos && p) p = f_next(m, p, &l); @@ -933,6 +938,7 @@ static void *f_start(struct seq_file *m, loff_t *pos) static void f_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { @@ -944,7 +950,6 @@ static const struct seq_operations trace_format_seq_ops = { static int trace_format_open(struct inode *inode, struct file *file) { - struct
ftrace_event_call *call = inode->i_private; struct seq_file *m; int ret; @@ -953,7 +958,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - m->private = call; + m->private = file; return 0; } -- cgit v1.2.3 From f6a84bdc75b5c11621dec58db73fe102cbaf40cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Jul 2013 19:25:47 +0200 Subject: tracing: Introduce remove_event_file_dir() Preparation for the next patch. Extract the common code from remove_event_from_tracers() and __trace_remove_event_dirs() into the new helper, remove_event_file_dir(). The patch looks more complicated than it actually is, it also moves remove_subsystem() up to avoid the forward declaration. Link: http://lkml.kernel.org/r/20130726172547.GA3629@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 47 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 50dc8b2e5435..05d647ecd01a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -409,11 +409,31 @@ static void put_system(struct ftrace_subsystem_dir *dir) mutex_unlock(&event_mutex); } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + static void *event_file_data(struct file *filp) { return ACCESS_ONCE(file_inode(filp)->i_private); } +static void remove_event_file_dir(struct ftrace_event_file *file) +{ + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kmem_cache_free(file_cachep, file); +} + /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. 
@@ -1549,33 +1569,16 @@ event_create_dir(struct dentry *parent, return 0; } -static void remove_subsystem(struct ftrace_subsystem_dir *dir) -{ - if (!dir) - return; - - if (!--dir->nr_events) { - debugfs_remove_recursive(dir->entry); - list_del(&dir->list); - __put_system_dir(dir); - } -} - static void remove_event_from_tracers(struct ftrace_event_call *call) { struct ftrace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { - if (file->event_call != call) continue; - list_del(&file->list); - debugfs_remove_recursive(file->dir); - remove_subsystem(file->system); - kmem_cache_free(file_cachep, file); - + remove_event_file_dir(file); /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this @@ -2305,12 +2308,8 @@ __trace_remove_event_dirs(struct trace_array *tr) { struct ftrace_event_file *file, *next; - list_for_each_entry_safe(file, next, &tr->events, list) { - list_del(&file->list); - debugfs_remove_recursive(file->dir); - remove_subsystem(file->system); - kmem_cache_free(file_cachep, file); - } + list_for_each_entry_safe(file, next, &tr->events, list) + remove_event_file_dir(file); } static void -- cgit v1.2.3 From bf682c3159c4d298d1126a56793ed3f5e80395f7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 28 Jul 2013 20:35:27 +0200 Subject: tracing: Change remove_event_file_dir() to clear "d_subdirs"->i_private Change remove_event_file_dir() to clear ->i_private for every file we are going to remove. We need to check file->dir != NULL because event_create_dir() can fail. debugfs_remove_recursive(NULL) is fine but the patch moves it under the same check anyway for readability. spin_lock(d_lock) and "d_inode != NULL" check are not needed afaics, but I do not understand this code enough. tracing_open_generic_file() and tracing_release_generic_file() can go away, ftrace_enable_fops and ftrace_event_filter_fops() use tracing_open_generic() but only to check tracing_disabled. 
This fixes all races with event_remove() or instance_delete(). f_op->read/write/whatever can never use the freed file/call, all event/* files were changed to check and use ->i_private under event_mutex. Note: this does not fix other problems, event_remove() can destroy the active ftrace_event_call, we need more changes but those changes are completely orthogonal. Link: http://lkml.kernel.org/r/20130728183527.GB16723@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 47 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05d647ecd01a..a67c913e2f9f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -428,41 +428,25 @@ static void *event_file_data(struct file *filp) static void remove_event_file_dir(struct ftrace_event_file *file) { + struct dentry *dir = file->dir; + struct dentry *child; + + if (dir) { + spin_lock(&dir->d_lock); /* probably unneeded */ + list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { + if (child->d_inode) /* probably unneeded */ + child->d_inode->i_private = NULL; + } + spin_unlock(&dir->d_lock); + + debugfs_remove_recursive(dir); + } + list_del(&file->list); - debugfs_remove_recursive(file->dir); remove_subsystem(file->system); kmem_cache_free(file_cachep, file); } -/* - * Open and update trace_array ref count. - * Must have the current trace_array passed to it.
- */ -static int tracing_open_generic_file(struct inode *inode, struct file *filp) -{ - struct ftrace_event_file *file = inode->i_private; - struct trace_array *tr = file->tr; - int ret; - - if (trace_array_get(tr) < 0) - return -ENODEV; - - ret = tracing_open_generic(inode, filp); - if (ret < 0) - trace_array_put(tr); - return ret; -} - -static int tracing_release_generic_file(struct inode *inode, struct file *filp) -{ - struct ftrace_event_file *file = inode->i_private; - struct trace_array *tr = file->tr; - - trace_array_put(tr); - - return 0; -} - /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -1281,10 +1265,9 @@ static const struct file_operations ftrace_set_event_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic_file, + .open = tracing_open_generic, .read = event_enable_read, .write = event_enable_write, - .release = tracing_release_generic_file, .llseek = default_llseek, }; -- cgit v1.2.3 From 1c80c43290ee576afe8d39ecc905fa3958a5858c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 25 Jul 2013 20:22:00 -0400 Subject: ftrace: Consolidate some duplicate code for updating ftrace ops When ftrace ops modifies the functions that it will trace, the update to the function mcount callers may need to be modified. Consolidate the two places that do the checks to see if an update is required with a wrapper function for those checks. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8ce9eefc5bb4..92d3334de0c3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3384,6 +3384,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } +static void ftrace_ops_update_code(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} + static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) @@ -3426,9 +3432,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(ops); mutex_unlock(&ftrace_lock); @@ -3655,9 +3660,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); - if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(iter->ops); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From 2b44c4db2e2f1765d35163a861d301038e0c8a75 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Wed, 24 Jul 2013 17:41:33 -0700 Subject: freezer: set PF_SUSPEND_TASK flag on tasks that call freeze_processes Calling freeze_processes sets a global flag that will cause any process that calls try_to_freeze to enter the refrigerator. It skips sending a signal to the current task, but if the current task ever hits try_to_freeze, all threads will be frozen and the system will deadlock. 
Set a new flag, PF_SUSPEND_TASK, on the task that calls freeze_processes. The flag notifies the freezer that the thread is involved in suspend and should not be frozen. Also add a WARN_ON in thaw_processes if the caller does not have the PF_SUSPEND_TASK flag set to catch if a different task calls thaw_processes than the one that called freeze_processes, leaving a task with PF_SUSPEND_TASK permanently set on it. Threads that spawn off a task with PF_SUSPEND_TASK set (which swsusp does) will also have PF_SUSPEND_TASK set, preventing them from freezing while they are helping with suspend, but they need to be dead by the time suspend is triggered, otherwise they may run when userspace is expected to be frozen. Add a WARN_ON in thaw_processes if more than one thread has the PF_SUSPEND_TASK flag set. Reported-and-tested-by: Michael Leun Signed-off-by: Colin Cross Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 2 +- kernel/power/process.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 8b2afc1c9df0..b462fa197517 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock); */ bool freezing_slow_path(struct task_struct *p) { - if (p->flags & PF_NOFREEZE) + if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/power/process.c b/kernel/power/process.c index fc0df8486449..06ec8869dbf1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -109,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only) /** * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen. The same process that calls + * freeze_processes must later call thaw_processes. * * On success, returns 0. On failure, -errno and system is fully thawed. 
*/ @@ -120,6 +122,9 @@ int freeze_processes(void) if (error) return error; + /* Make sure this task doesn't get frozen */ + current->flags |= PF_SUSPEND_TASK; + if (!pm_freezing) atomic_inc(&system_freezing_cnt); @@ -168,6 +173,7 @@ int freeze_kernel_threads(void) void thaw_processes(void) { struct task_struct *g, *p; + struct task_struct *curr = current; if (pm_freezing) atomic_dec(&system_freezing_cnt); @@ -182,10 +188,15 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { + /* No other threads should have PF_SUSPEND_TASK set */ + WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); + WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); + curr->flags &= ~PF_SUSPEND_TASK; + usermodehelper_enable(); schedule(); -- cgit v1.2.3 From bf0bd948d1682e3996adc093b43021ed391983e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jul 2013 23:48:42 +0200 Subject: sched: Ensure update_cfs_shares() is called for parents of continuously-running tasks We typically update a task_group's shares within the dequeue/enqueue path. However, continuously running tasks sharing a CPU are not subject to these updates as they are only put/picked. Unfortunately, when we reverted f269ae046 (in 17bc14b7), we lost the augmenting periodic update that was supposed to account for this; resulting in a potential loss of fairness. To fix this, re-introduce the explicit update in update_cfs_rq_blocked_load() [called via entity_tick()]. 
Reported-by: Max Hailperin Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/n/tip-9545m3apw5d93ubyrotrj31y@git.kernel.org Cc: Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 98d135584b4b..06db94bf47a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_entity_load_avg(curr, 1); update_cfs_rq_blocked_load(cfs_rq, 1); + update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK /* -- cgit v1.2.3 From 85f4896123d0299128f2c95cc40f3b8b01d4b0f6 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Tue, 30 Jul 2013 10:13:41 +0200 Subject: mutex: Fix w/w mutex deadlock injection The check needs to be for > 1, because ctx->acquired is already incremented. This will prevent ww_mutex_lock_slow from returning -EDEADLK and not locking the mutex. It caused a lot of false gpu lockups on radeon with CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y because a function that shouldn't be able to return -EDEADLK did. 
Signed-off-by: Maarten Lankhorst Signed-off-by: Peter Zijlstra Cc: Alex Deucher Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/51F775B5.201@canonical.com Signed-off-by: Ingo Molnar --- kernel/mutex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index ff05f4bd86eb..a52ee7bb830d 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -686,7 +686,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, &ctx->dep_map, _RET_IP_, ctx); - if (!ret && ctx->acquired > 0) + if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; @@ -702,7 +702,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, &ctx->dep_map, _RET_IP_, ctx); - if (!ret && ctx->acquired > 0) + if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); return ret; -- cgit v1.2.3 From 8c4f3c3fa9681dc549cd35419b259496082fef8b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Jul 2013 00:04:32 -0400 Subject: ftrace: Check module functions being traced on reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's been a nasty bug that would show up and not give much info. The bug displayed the following warning: WARNING: at kernel/trace/ftrace.c:1529 __ftrace_hash_rec_update+0x1e3/0x230() Pid: 20903, comm: bash Tainted: G O 3.6.11+ #38405.trunk Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] __ftrace_hash_rec_update+0x1e3/0x230 [] ftrace_hash_move+0x28/0x1d0 [] ? kfree+0x2c/0x110 [] ftrace_regex_release+0x8e/0x150 [] __fput+0xae/0x220 [] ____fput+0xe/0x10 [] task_work_run+0x72/0x90 [] do_notify_resume+0x6c/0xc0 [] ? 
trace_hardirqs_on_thunk+0x3a/0x3c [] int_signal+0x12/0x17 ---[ end trace 793179526ee09b2c ]--- It was finally narrowed down to unloading a module that was being traced. It was actually more than that. When functions are being traced, there's a table of all functions that have a ref count of the number of active tracers attached to that function. When a function trace callback is registered to a function, the function's record ref count is incremented. When it is unregistered, the function's record ref count is decremented. If an inconsistency is detected (ref count goes below zero) the above warning is shown and the function tracing is permanently disabled until reboot. The ftrace callback ops holds a hash of functions that it filters on (and/or filters off). If the hash is empty, the default means to filter all functions (for the filter_hash) or to disable no functions (for the notrace_hash). When a module is unloaded, it frees the function records that represent the module functions. These records exist on their own pages, that is function records for one module will not exist on the same page as function records for other modules or even the core kernel. Now when a module unloads, the records that represent its functions are freed. When the module is loaded again, the records are recreated with a default ref count of zero (unless there's a callback that traces all functions, then they will also be traced, and the ref count will be incremented). The problem is that if an ftrace callback hash includes functions of the module being unloaded, those hash entries will not be removed. If the module is reloaded in the same location, the hash entries still point to the functions of the module but the module's ref counts do not reflect that. With the help of Steve and Joern, we found a reproducer: Using uinput module and uinput_release function.
cd /sys/kernel/debug/tracing modprobe uinput echo uinput_release > set_ftrace_filter echo function > current_tracer rmmod uinput modprobe uinput # check /proc/modules to see if loaded in same addr, otherwise try again echo nop > current_tracer [BOOM] The above loads the uinput module, which creates a table of functions that can be traced within the module. We add uinput_release to the filter_hash to trace just that function. Enable function tracing, which increments the ref count of the record associated to uinput_release. Remove uinput, which frees the records including the one that represents uinput_release. Load the uinput module again (and make sure it's at the same address). This recreates the function records all with a ref count of zero, including uinput_release. Disable function tracing, which will decrement the ref count for uinput_release which is now zero because of the module removal and reload, and we have a mismatch (below zero ref count). The solution is to check all currently tracing ftrace callbacks to see if any are tracing any of the module's functions when a module is loaded (it already does that with callbacks that trace all functions). If a callback happens to have a module function being traced, it increments that record's ref count and starts tracing that function. There may be a strange side effect with this, where tracing module functions on unload and then reloading a new module may have that new module's functions being traced. This may be something that confuses the user, but it's not a big deal. Another approach is to disable all callback hashes on module unload, but this leaves some ftrace callbacks that may not be registered, but can still have hashes tracing the module's function where ftrace doesn't know about it. That situation can cause the same bug. This solution solves that case too. Another benefit of this solution is that it is possible to trace a module's function on unload and load.
Link: http://lkml.kernel.org/r/20130705142629.GA325@redhat.com Reported-by: Jörn Engel Reported-by: Dave Jones Reported-by: Steve Hodgson Tested-by: Steve Hodgson Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 71 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92d3334de0c3..a6d098c6df3f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2169,12 +2169,57 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ops_traces_mod(struct ftrace_ops *ops) +static inline int ops_traces_mod(struct ftrace_ops *ops) { - struct ftrace_hash *hash; + /* + * Filter_hash being empty will default to trace module. + * But notrace hash requires a test of individual module functions. + */ + return ftrace_hash_empty(ops->filter_hash) && + ftrace_hash_empty(ops->notrace_hash); +} + +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. 
+ */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + /* If ops traces all mods, we already accounted for it */ + if (ops_traces_mod(ops)) + return 0; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->filter_hash) && + !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + return 0; - hash = ops->filter_hash; - return ftrace_hash_empty(hash); + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + return 0; + + return 1; +} + +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; } static int ftrace_update_code(struct module *mod) @@ -2183,6 +2228,7 @@ static int ftrace_update_code(struct module *mod) struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + bool test = false; int i; /* @@ -2196,9 +2242,12 @@ static int ftrace_update_code(struct module *mod) for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && - ops_traces_mod(ops)) - ref++; + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + if (ops_traces_mod(ops)) + ref++; + else + test = true; + } } } @@ -2208,12 +2257,16 @@ static int ftrace_update_code(struct module *mod) for (pg = ftrace_new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { + int cnt = ref; + /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - p->flags = ref; + if (test) + cnt += referenced_filters(p); + p->flags = cnt; /* * Do the initial record conversion from mcount jump @@ -2233,7 +2286,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing 
the ftrace_make_call check. */ - if (ftrace_start_up && ref) { + if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) ftrace_bug(failed, p->ip); -- cgit v1.2.3 From da0a12caffad2eeadea429f83818408e7b77379a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:16:28 +0800 Subject: cgroup: fix a leak when percpu_ref_init() fails ss->css_free() is not called when percpu_ref_init() fails. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index afb8d53ca6c7..468e410f9e61 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4344,8 +4344,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } err = percpu_ref_init(&css->refcnt, css_release); - if (err) + if (err) { + ss->css_free(cgrp); goto err_free_all; + } init_cgroup_css(css, ss, cgrp); -- cgit v1.2.3 From 2816c551c796ec14620325b2c9ed75b9979d3125 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Jul 2013 19:50:33 +0200 Subject: tracing: trace_remove_event_call() should fail if call/file is in use Change trace_remove_event_call(call) to return the error if this call is active. This is what the callers assume but can't verify outside of the tracing locks. Both trace_kprobe.c/trace_uprobe.c need the additional changes, unregister_trace_probe() should abort if trace_remove_event_call() fails. The caller is going to free this call/file so we must ensure that nobody can use them after trace_remove_event_call() succeeds. debugfs should be fine after the previous changes and event_remove() does TRACE_REG_UNREGISTER, but still there are 2 reasons why we need the additional checks: - There could be a perf_event(s) attached to this tp_event, so the patch checks ->perf_refcount.
- TRACE_REG_UNREGISTER can be suppressed by FTRACE_EVENT_FL_SOFT_MODE, so we simply check FTRACE_EVENT_FL_ENABLED protected by event_mutex. Link: http://lkml.kernel.org/r/20130729175033.GB26284@redhat.com Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a67c913e2f9f..ec04836273c0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1713,16 +1713,47 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) destroy_preds(call); } +static int probe_remove_event_call(struct ftrace_event_call *call) +{ + struct trace_array *tr; + struct ftrace_event_file *file; + +#ifdef CONFIG_PERF_EVENTS + if (call->perf_refcount) + return -EBUSY; +#endif + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + /* + * We can't rely on ftrace_event_enable_disable(enable => 0) + * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress + * TRACE_REG_UNREGISTER. 
+ */ + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return -EBUSY; + break; + } while_for_each_event_file(); + + __trace_remove_event_call(call); + + return 0; +} + /* Remove an event_call */ -void trace_remove_event_call(struct ftrace_event_call *call) +int trace_remove_event_call(struct ftrace_event_call *call) { + int ret; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); down_write(&trace_event_sem); - __trace_remove_event_call(call); + ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + + return ret; } #define for_each_event(event, start, end) \ -- cgit v1.2.3 From 2ba64035d0ca966fd189bc3e0826343fc81bf482 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 31 Jul 2013 13:16:22 -0400 Subject: tracing: Add comment to describe special break case in probe_remove_event_call() The "break" used in the do_for_each_event_file() is used as an optimization as the loop is really a double loop. The loop searches all event files for each trace_array. There's only one matching event file per trace_array and after we find the event file for the trace_array, the break is used to jump to the next trace_array and start the search there. As this is not a standard way of using "break" in C code, it requires a comment right before the break to let people know what is going on. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec04836273c0..29a7ebcfb426 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1732,6 +1732,12 @@ static int probe_remove_event_call(struct ftrace_event_call *call) */ if (file->flags & FTRACE_EVENT_FL_ENABLED) return -EBUSY; + /* + * The do_for_each_event_file_safe() is + * a double loop. 
After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ break; } while_for_each_event_file(); -- cgit v1.2.3 From 10e84b97ed799be404836dc7f71ab47d4571265a Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 31 Jul 2013 13:53:35 -0700 Subject: mm: sched: numa: fix NUMA balancing when !SCHED_DEBUG Commit 3105b86a9fee ("mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG") defined numabalancing_enabled to control the enabling and disabling of automatic NUMA balancing, but it is never used. I believe the intention was to use this in place of sched_feat_numa(NUMA). Currently, if SCHED_DEBUG is not defined, sched_feat_numa(NUMA) will never be changed from the initial "false". Signed-off-by: Dave Kleikamp Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb456f44b7b1..9565645e3202 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; - if (!sched_feat_numa(NUMA)) + if (!numabalancing_enabled) return; /* FIXME: Allocate task-specific structure for placement policy here */ @@ -5786,7 +5786,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_feat_numa(NUMA)) + if (numabalancing_enabled) task_tick_numa(rq, curr); update_rq_runnable_avg(rq, 1); -- cgit v1.2.3 From b9ee979e9d770dc10f94936ef6ff9efddc23c911 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 13:53:42 -0700 Subject: printk: move to separate directory for easier modification Make it easier to break up printk into bite-sized chunks. Remove printk path/filename from comment. 
Signed-off-by: Joe Perches Cc: Samuel Thibault Cc: Ming Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 +- kernel/printk.c | 2924 ------------------------------------------------ kernel/printk/Makefile | 1 + kernel/printk/printk.c | 2924 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 2927 insertions(+), 2925 deletions(-) delete mode 100644 kernel/printk.c create mode 100644 kernel/printk/Makefile create mode 100644 kernel/printk/printk.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 470839d1a30e..35ef1185e359 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -obj-y = fork.o exec_domain.o panic.o printk.o \ +obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ @@ -24,6 +24,7 @@ endif obj-y += sched/ obj-y += power/ +obj-y += printk/ obj-y += cpu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o diff --git a/kernel/printk.c b/kernel/printk.c deleted file mode 100644 index 69b0890ed7e5..000000000000 --- a/kernel/printk.c +++ /dev/null @@ -1,2924 +0,0 @@ -/* - * linux/kernel/printk.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Modified to make sys_syslog() more flexible: added commands to - * return the last 4k of kernel messages, regardless of whether - * they've been read or not. Added option to suppress kernel printk's - * to the console. Added hook for sending the console messages - * elsewhere, in preparation for a serial line console (someday). - * Ted Ts'o, 2/11/93. - * Modified for sysctl support, 1/8/97, Chris Horn. 
- * Fixed SMP synchronization, 08/08/99, Manfred Spraul - * manfred@colorfullife.com - * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* For in_interrupt() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define CREATE_TRACE_POINTS -#include - -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. */ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - -int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ -}; - -/* - * Low level drivers may need that to know if they can schedule in - * their unblank() callback or not. So let's export it. - */ -int oops_in_progress; -EXPORT_SYMBOL(oops_in_progress); - -/* - * console_sem protects the console_drivers list, and also - * provides serialisation for access to the entire console - * driver system. - */ -static DEFINE_SEMAPHORE(console_sem); -struct console *console_drivers; -EXPORT_SYMBOL_GPL(console_drivers); - -#ifdef CONFIG_LOCKDEP -static struct lockdep_map console_lock_dep_map = { - .name = "console_lock" -}; -#endif - -/* - * This is used for debugging the mess that is the VT code by - * keeping track if we have the console semaphore held. 
It's - * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held - */ -static int console_locked, console_suspended; - -/* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - -/* - * Array of consoles built from command line options (console=) - */ -struct console_cmdline -{ - char name[8]; /* Name of the driver */ - int index; /* Minor dev. to use */ - char *options; /* Options for the driver */ -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - char *brl_options; /* Options for braille driver */ -#endif -}; - -#define MAX_CMDLINECONSOLES 8 - -static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int selected_console = -1; -static int preferred_console = -1; -int console_set_on_cmdline; -EXPORT_SYMBOL(console_set_on_cmdline); - -/* Flag: console code may call schedule() */ -static int console_may_schedule; - -/* - * The printk log buffer consists of a chain of concatenated variable - * length records. Every record starts with a record header, containing - * the overall length of the record. - * - * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. - * - * If the heads indicate available messages, the length in the header - * tells the start next message. A length == 0 for the next message - * indicates a wrap-around to the beginning of the buffer. - * - * Every record carries the monotonic timestamp in microseconds, as well as - * the standard userspace syslog level and syslog facility. The usual - * kernel messages use LOG_KERN; userspace-injected messages always carry - * a matching syslog facility, by default LOG_USER. The origin of every - * message can be reliably determined that way. 
- * - * The human readable log message directly follows the message header. The - * length of the message text is stored in the header, the stored message - * is not terminated. - * - * Optionally, a message can carry a dictionary of properties (key/value pairs), - * to provide userspace with a machine-readable message context. - * - * Examples for well-defined, commonly used property names are: - * DEVICE=b12:8 device identifier - * b12:8 block dev_t - * c127:3 char dev_t - * n8 netdev ifindex - * +sound:card0 subsystem:devname - * SUBSYSTEM=pci driver-core subsystem name - * - * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value - * follows directly after a '=' character. Every property is terminated by - * a '\0' character. The last property is not terminated. - * - * Example of a message structure: - * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec - * 0008 34 00 record is 52 bytes long - * 000a 0b 00 text is 11 bytes long - * 000c 1f 00 dictionary is 23 bytes long - * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) - * 0010 69 74 27 73 20 61 20 6c "it's a l" - * 69 6e 65 "ine" - * 001b 44 45 56 49 43 "DEVIC" - * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" - * 52 49 56 45 52 3d 62 75 "RIVER=bu" - * 67 "g" - * 0032 00 00 00 padding to next message header - * - * The 'struct log' buffer header must never be directly exported to - * userspace, it is a kernel-private implementation detail that might - * need to be changed in the future, when the requirements change. - * - * /dev/kmsg exports the structured data in the following line format: - * "level,sequnum,timestamp;\n" - * - * The optional key/value pairs are attached as continuation lines starting - * with a space character and terminated by a newline. All possible - * non-prinatable characters are escaped in the "\xff" notation. - * - * Users of the export format should ignore possible additional values - * separated by ',', and find the message after the ';' character. 
- */ - -enum log_flags { - LOG_NOCONS = 1, /* already flushed, do not print to console */ - LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_PREFIX = 4, /* text started with a prefix */ - LOG_CONT = 8, /* text is a fragment of a continuation line */ -}; - -struct log { - u64 ts_nsec; /* timestamp in nanoseconds */ - u16 len; /* length of entire record */ - u16 text_len; /* length of text buffer */ - u16 dict_len; /* length of dictionary buffer */ - u8 facility; /* syslog facility */ - u8 flags:5; /* internal record flags */ - u8 level:3; /* syslog level */ -}; - -/* - * The logbuf_lock protects kmsg buffer, indices, counters. It is also - * used in interesting ways to provide interlocking in console_unlock(); - */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); - -#ifdef CONFIG_PRINTK -DECLARE_WAIT_QUEUE_HEAD(log_wait); -/* the next printk record to read by syslog(READ) or /proc/kmsg */ -static u64 syslog_seq; -static u32 syslog_idx; -static enum log_flags syslog_prev; -static size_t syslog_partial; - -/* index and sequence number of the first record stored in the buffer */ -static u64 log_first_seq; -static u32 log_first_idx; - -/* index and sequence number of the next record to store in the buffer */ -static u64 log_next_seq; -static u32 log_next_idx; - -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; -static enum log_flags console_prev; - -/* the next printk record to read after the last 'clear' command */ -static u64 clear_seq; -static u32 clear_idx; - -#define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX - -/* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else -#define LOG_ALIGN __alignof__(struct log) -#endif -#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) -static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); -static char *log_buf = __log_buf; -static u32 log_buf_len = __LOG_BUF_LEN; - -/* cpu currently holding logbuf_lock */ -static 
volatile unsigned int logbuf_cpu = UINT_MAX; - -/* human readable text of the record */ -static char *log_text(const struct log *msg) -{ - return (char *)msg + sizeof(struct log); -} - -/* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct log *msg) -{ - return (char *)msg + sizeof(struct log) + msg->text_len; -} - -/* get record by index; idx must point to valid msg */ -static struct log *log_from_idx(u32 idx) -{ - struct log *msg = (struct log *)(log_buf + idx); - - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer. - */ - if (!msg->len) - return (struct log *)log_buf; - return msg; -} - -/* get next record; idx must point to valid msg */ -static u32 log_next(u32 idx) -{ - struct log *msg = (struct log *)(log_buf + idx); - - /* length == 0 indicates the end of the buffer; wrap */ - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer as *this* one, and - * return the one after that. 
- */ - if (!msg->len) { - msg = (struct log *)log_buf; - return msg->len; - } - return idx + msg->len; -} - -/* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, - const char *text, u16 text_len) -{ - struct log *msg; - u32 size, pad_len; - - /* number of '\0' padding bytes to next message */ - size = sizeof(struct log) + text_len + dict_len; - pad_len = (-size) & (LOG_ALIGN - 1); - size += pad_len; - - while (log_first_seq < log_next_seq) { - u32 free; - - if (log_next_idx > log_first_idx) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - if (free > size + sizeof(struct log)) - break; - - /* drop old messages until we have enough contiuous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } - - if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { - /* - * This message + an additional empty header does not fit - * at the end of the buffer. Add an empty header with len == 0 - * to signify a wrap around. 
- */ - memset(log_buf + log_next_idx, 0, sizeof(struct log)); - log_next_idx = 0; - } - - /* fill message */ - msg = (struct log *)(log_buf + log_next_idx); - memcpy(log_text(msg), text, text_len); - msg->text_len = text_len; - memcpy(log_dict(msg), dict, dict_len); - msg->dict_len = dict_len; - msg->facility = facility; - msg->level = level & 7; - msg->flags = flags & 0x1f; - if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; - else - msg->ts_nsec = local_clock(); - memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct log) + text_len + dict_len + pad_len; - - /* insert message */ - log_next_idx += msg->len; - log_next_seq++; -} - -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* - * Unless restricted, we allow "read all" and "get buffer size" - * for everybody. - */ - return type != SYSLOG_ACTION_READ_ALL && - type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* - * For historical reasons, accept CAP_SYS_ADMIN too, with - * a warning. 
- */ - if (capable(CAP_SYS_ADMIN)) { - pr_warn_once("%s (%d): Attempt to access syslog with " - "CAP_SYS_ADMIN but no CAP_SYSLOG " - "(deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return security_syslog(type); -} - - -/* /dev/kmsg - userspace message inject/listen interface */ -struct devkmsg_user { - u64 seq; - u32 idx; - enum log_flags prev; - struct mutex lock; - char buf[8192]; -}; - -static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) -{ - char *buf, *line; - int i; - int level = default_message_loglevel; - int facility = 1; /* LOG_USER */ - size_t len = iov_length(iv, count); - ssize_t ret = len; - - if (len > LOG_LINE_MAX) - return -EINVAL; - buf = kmalloc(len+1, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; - - line = buf; - for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { - ret = -EFAULT; - goto out; - } - line += iv[i].iov_len; - } - - /* - * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace - * the decimal value represents 32bit, the lower 3 bit are the log - * level, the rest are the log facility. - * - * If no prefix or no userspace facility is specified, we - * enforce LOG_USER, to be able to reliably distinguish - * kernel-generated messages from userspace-injected ones. 
- */ - line = buf; - if (line[0] == '<') { - char *endp = NULL; - - i = simple_strtoul(line+1, &endp, 10); - if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; - endp++; - len -= endp - line; - line = endp; - } - } - line[len] = '\0'; - - printk_emit(facility, level, NULL, 0, "%s", line); -out: - kfree(buf); - return ret; -} - -static ssize_t devkmsg_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct devkmsg_user *user = file->private_data; - struct log *msg; - u64 ts_usec; - size_t i; - char cont = '-'; - size_t len; - ssize_t ret; - - if (!user) - return -EBADF; - - ret = mutex_lock_interruptible(&user->lock); - if (ret) - return ret; - raw_spin_lock_irq(&logbuf_lock); - while (user->seq == log_next_seq) { - if (file->f_flags & O_NONBLOCK) { - ret = -EAGAIN; - raw_spin_unlock_irq(&logbuf_lock); - goto out; - } - - raw_spin_unlock_irq(&logbuf_lock); - ret = wait_event_interruptible(log_wait, - user->seq != log_next_seq); - if (ret) - goto out; - raw_spin_lock_irq(&logbuf_lock); - } - - if (user->seq < log_first_seq) { - /* our last seen message is gone, return error and reset */ - user->idx = log_first_idx; - user->seq = log_first_seq; - ret = -EPIPE; - raw_spin_unlock_irq(&logbuf_lock); - goto out; - } - - msg = log_from_idx(user->idx); - ts_usec = msg->ts_nsec; - do_div(ts_usec, 1000); - - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. 
- */ - if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; - - len = sprintf(user->buf, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, - user->seq, ts_usec, cont); - user->prev = msg->flags; - - /* escape non-printable characters */ - for (i = 0; i < msg->text_len; i++) { - unsigned char c = log_text(msg)[i]; - - if (c < ' ' || c >= 127 || c == '\\') - len += sprintf(user->buf + len, "\\x%02x", c); - else - user->buf[len++] = c; - } - user->buf[len++] = '\n'; - - if (msg->dict_len) { - bool line = true; - - for (i = 0; i < msg->dict_len; i++) { - unsigned char c = log_dict(msg)[i]; - - if (line) { - user->buf[len++] = ' '; - line = false; - } - - if (c == '\0') { - user->buf[len++] = '\n'; - line = true; - continue; - } - - if (c < ' ' || c >= 127 || c == '\\') { - len += sprintf(user->buf + len, "\\x%02x", c); - continue; - } - - user->buf[len++] = c; - } - user->buf[len++] = '\n'; - } - - user->idx = log_next(user->idx); - user->seq++; - raw_spin_unlock_irq(&logbuf_lock); - - if (len > count) { - ret = -EINVAL; - goto out; - } - - if (copy_to_user(buf, user->buf, len)) { - ret = -EFAULT; - goto out; - } - ret = len; -out: - mutex_unlock(&user->lock); - return ret; -} - -static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) -{ - struct devkmsg_user *user = file->private_data; - loff_t ret = 0; - - if (!user) - return -EBADF; - if (offset) - return -ESPIPE; - - raw_spin_lock_irq(&logbuf_lock); - switch (whence) { - case SEEK_SET: - /* the first record */ - user->idx = log_first_idx; - user->seq = log_first_seq; - break; - case SEEK_DATA: - /* - * The first record after the last SYSLOG_ACTION_CLEAR, - * like issued by 'dmesg -c'. Reading /dev/kmsg itself - * changes no global state, and does not clear anything. 
- */ - user->idx = clear_idx; - user->seq = clear_seq; - break; - case SEEK_END: - /* after the last record */ - user->idx = log_next_idx; - user->seq = log_next_seq; - break; - default: - ret = -EINVAL; - } - raw_spin_unlock_irq(&logbuf_lock); - return ret; -} - -static unsigned int devkmsg_poll(struct file *file, poll_table *wait) -{ - struct devkmsg_user *user = file->private_data; - int ret = 0; - - if (!user) - return POLLERR|POLLNVAL; - - poll_wait(file, &log_wait, wait); - - raw_spin_lock_irq(&logbuf_lock); - if (user->seq < log_next_seq) { - /* return error when data has vanished underneath us */ - if (user->seq < log_first_seq) - ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; - else - ret = POLLIN|POLLRDNORM; - } - raw_spin_unlock_irq(&logbuf_lock); - - return ret; -} - -static int devkmsg_open(struct inode *inode, struct file *file) -{ - struct devkmsg_user *user; - int err; - - /* write-only does not need any file context */ - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - return 0; - - err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - if (err) - return err; - - user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); - if (!user) - return -ENOMEM; - - mutex_init(&user->lock); - - raw_spin_lock_irq(&logbuf_lock); - user->idx = log_first_idx; - user->seq = log_first_seq; - raw_spin_unlock_irq(&logbuf_lock); - - file->private_data = user; - return 0; -} - -static int devkmsg_release(struct inode *inode, struct file *file) -{ - struct devkmsg_user *user = file->private_data; - - if (!user) - return 0; - - mutex_destroy(&user->lock); - kfree(user); - return 0; -} - -const struct file_operations kmsg_fops = { - .open = devkmsg_open, - .read = devkmsg_read, - .aio_write = devkmsg_writev, - .llseek = devkmsg_llseek, - .poll = devkmsg_poll, - .release = devkmsg_release, -}; - -#ifdef CONFIG_KEXEC -/* - * This appends the listed symbols to /proc/vmcoreinfo - * - * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile 
to - * obtain access to symbols that are otherwise very difficult to locate. These - * symbols are specifically used so that utilities can access and extract the - * dmesg log from a vmcore file after a crash. - */ -void log_buf_kexec_setup(void) -{ - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(log_first_idx); - VMCOREINFO_SYMBOL(log_next_idx); - /* - * Export struct log size and field offsets. User space tools can - * parse it and detect any changes to structure down the line. - */ - VMCOREINFO_STRUCT_SIZE(log); - VMCOREINFO_OFFSET(log, ts_nsec); - VMCOREINFO_OFFSET(log, len); - VMCOREINFO_OFFSET(log, text_len); - VMCOREINFO_OFFSET(log, dict_len); -} -#endif - -/* requested log_buf_len from kernel cmdline */ -static unsigned long __initdata new_log_buf_len; - -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) -{ - unsigned size = memparse(str, &str); - - if (size) - size = roundup_pow_of_two(size); - if (size > log_buf_len) - new_log_buf_len = size; - - return 0; -} -early_param("log_buf_len", log_buf_len_setup); - -void __init setup_log_buf(int early) -{ - unsigned long flags; - char *new_log_buf; - int free; - - if (!new_log_buf_len) - return; - - if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); - } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); - } - - if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %ld bytes not available\n", - new_log_buf_len); - return; - } - - raw_spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = new_log_buf_len; - log_buf = new_log_buf; - new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_next_idx; - memcpy(log_buf, __log_buf, __LOG_BUF_LEN); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - pr_info("log_buf_len: %d\n", log_buf_len); - pr_info("early log buf free: %d(%d%%)\n", - free, (free * 100) / __LOG_BUF_LEN); -} - 
-static bool __read_mostly ignore_loglevel; - -static int __init ignore_loglevel_setup(char *str) -{ - ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); - - return 0; -} - -early_param("ignore_loglevel", ignore_loglevel_setup); -module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); - -#ifdef CONFIG_BOOT_PRINTK_DELAY - -static int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long loops_per_msec; /* based on boot_delay */ - -static int __init boot_delay_setup(char *str) -{ - unsigned long lpj; - - lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ - loops_per_msec = (unsigned long long)lpj / 1000 * HZ; - - get_option(&str, &boot_delay); - if (boot_delay > 10 * 1000) - boot_delay = 0; - - pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, loops_per_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, loops_per_msec); - return 1; -} -__setup("boot_delay=", boot_delay_setup); - -static void boot_delay_msec(int level) -{ - unsigned long long k; - unsigned long timeout; - - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) - || (level >= console_loglevel && !ignore_loglevel)) { - return; - } - - k = (unsigned long long)loops_per_msec * boot_delay; - - timeout = jiffies + msecs_to_jiffies(boot_delay); - while (k) { - k--; - cpu_relax(); - /* - * use (volatile) jiffies to prevent - * compiler reduction; loop termination via jiffies - * is secondary and may or may not happen. 
- */ - if (time_after(jiffies, timeout)) - break; - touch_nmi_watchdog(); - } -} -#else -static inline void boot_delay_msec(int level) -{ -} -#endif - -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif -module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); - -static size_t print_time(u64 ts, char *buf) -{ - unsigned long rem_nsec; - - if (!printk_time) - return 0; - - rem_nsec = do_div(ts, 1000000000); - - if (!buf) - return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts); - - return sprintf(buf, "[%5lu.%06lu] ", - (unsigned long)ts, rem_nsec / 1000); -} - -static size_t print_prefix(const struct log *msg, bool syslog, char *buf) -{ - size_t len = 0; - unsigned int prefix = (msg->facility << 3) | msg->level; - - if (syslog) { - if (buf) { - len += sprintf(buf, "<%u>", prefix); - } else { - len += 3; - if (prefix > 999) - len += 3; - else if (prefix > 99) - len += 2; - else if (prefix > 9) - len++; - } - } - - len += print_time(msg->ts_nsec, buf ? 
buf + len : NULL); - return len; -} - -static size_t msg_print_text(const struct log *msg, enum log_flags prev, - bool syslog, char *buf, size_t size) -{ - const char *text = log_text(msg); - size_t text_size = msg->text_len; - bool prefix = true; - bool newline = true; - size_t len = 0; - - if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) - prefix = false; - - if (msg->flags & LOG_CONT) { - if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) - prefix = false; - - if (!(msg->flags & LOG_NEWLINE)) - newline = false; - } - - do { - const char *next = memchr(text, '\n', text_size); - size_t text_len; - - if (next) { - text_len = next - text; - next++; - text_size -= next - text; - } else { - text_len = text_size; - } - - if (buf) { - if (print_prefix(msg, syslog, NULL) + - text_len + 1 >= size - len) - break; - - if (prefix) - len += print_prefix(msg, syslog, buf + len); - memcpy(buf + len, text, text_len); - len += text_len; - if (next || newline) - buf[len++] = '\n'; - } else { - /* SYSLOG_ACTION_* buffer size only calculation */ - if (prefix) - len += print_prefix(msg, syslog, NULL); - len += text_len; - if (next || newline) - len++; - } - - prefix = true; - text = next; - } while (text); - - return len; -} - -static int syslog_print(char __user *buf, int size) -{ - char *text; - struct log *msg; - int len = 0; - - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); - if (!text) - return -ENOMEM; - - while (size > 0) { - size_t n; - size_t skip; - - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_prev = 0; - syslog_partial = 0; - } - if (syslog_seq == log_next_seq) { - raw_spin_unlock_irq(&logbuf_lock); - break; - } - - skip = syslog_partial; - msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (n - syslog_partial <= size) { - /* message fits into buffer, move 
forward */ - syslog_idx = log_next(syslog_idx); - syslog_seq++; - syslog_prev = msg->flags; - n -= syslog_partial; - syslog_partial = 0; - } else if (!len){ - /* partial read(), remember position */ - n = size; - syslog_partial += n; - } else - n = 0; - raw_spin_unlock_irq(&logbuf_lock); - - if (!n) - break; - - if (copy_to_user(buf, text + skip, n)) { - if (!len) - len = -EFAULT; - break; - } - - len += n; - size -= n; - buf += n; - } - - kfree(text); - return len; -} - -static int syslog_print_all(char __user *buf, int size, bool clear) -{ - char *text; - int len = 0; - - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); - if (!text) - return -ENOMEM; - - raw_spin_lock_irq(&logbuf_lock); - if (buf) { - u64 next_seq; - u64 seq; - u32 idx; - enum log_flags prev; - - if (clear_seq < log_first_seq) { - /* messages are gone, move to first available one */ - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - - /* - * Find first record that fits, including all following records, - * into the user-provided buffer for this dump. 
- */ - seq = clear_seq; - idx = clear_idx; - prev = 0; - while (seq < log_next_seq) { - struct log *msg = log_from_idx(idx); - - len += msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; - idx = log_next(idx); - seq++; - } - - /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - prev = 0; - while (len > size && seq < log_next_seq) { - struct log *msg = log_from_idx(idx); - - len -= msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; - idx = log_next(idx); - seq++; - } - - /* last message fitting into this dump */ - next_seq = log_next_seq; - - len = 0; - prev = 0; - while (len >= 0 && seq < next_seq) { - struct log *msg = log_from_idx(idx); - int textlen; - - textlen = msg_print_text(msg, prev, true, text, - LOG_LINE_MAX + PREFIX_MAX); - if (textlen < 0) { - len = textlen; - break; - } - idx = log_next(idx); - seq++; - prev = msg->flags; - - raw_spin_unlock_irq(&logbuf_lock); - if (copy_to_user(buf + len, text, textlen)) - len = -EFAULT; - else - len += textlen; - raw_spin_lock_irq(&logbuf_lock); - - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - prev = 0; - } - } - } - - if (clear) { - clear_seq = log_next_seq; - clear_idx = log_next_idx; - } - raw_spin_unlock_irq(&logbuf_lock); - - kfree(text); - return len; -} - -int do_syslog(int type, char __user *buf, int len, bool from_file) -{ - bool clear = false; - static int saved_console_loglevel = -1; - int error; - - error = check_syslog_permissions(type, from_file); - if (error) - goto out; - - error = security_syslog(type); - if (error) - return error; - - switch (type) { - case SYSLOG_ACTION_CLOSE: /* Close log */ - break; - case SYSLOG_ACTION_OPEN: /* Open log */ - break; - case SYSLOG_ACTION_READ: /* Read from log */ - error = -EINVAL; - if (!buf || len < 0) - goto out; - error = 0; - if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; 
- goto out; - } - error = wait_event_interruptible(log_wait, - syslog_seq != log_next_seq); - if (error) - goto out; - error = syslog_print(buf, len); - break; - /* Read/clear last kernel messages */ - case SYSLOG_ACTION_READ_CLEAR: - clear = true; - /* FALL THRU */ - /* Read last kernel messages */ - case SYSLOG_ACTION_READ_ALL: - error = -EINVAL; - if (!buf || len < 0) - goto out; - error = 0; - if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } - error = syslog_print_all(buf, len, clear); - break; - /* Clear ring buffer */ - case SYSLOG_ACTION_CLEAR: - syslog_print_all(NULL, 0, true); - break; - /* Disable logging to console */ - case SYSLOG_ACTION_CONSOLE_OFF: - if (saved_console_loglevel == -1) - saved_console_loglevel = console_loglevel; - console_loglevel = minimum_console_loglevel; - break; - /* Enable logging to console */ - case SYSLOG_ACTION_CONSOLE_ON: - if (saved_console_loglevel != -1) { - console_loglevel = saved_console_loglevel; - saved_console_loglevel = -1; - } - break; - /* Set level of messages printed to console */ - case SYSLOG_ACTION_CONSOLE_LEVEL: - error = -EINVAL; - if (len < 1 || len > 8) - goto out; - if (len < minimum_console_loglevel) - len = minimum_console_loglevel; - console_loglevel = len; - /* Implicitly re-enable logging to console */ - saved_console_loglevel = -1; - error = 0; - break; - /* Number of chars in the log buffer */ - case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_prev = 0; - syslog_partial = 0; - } - if (from_file) { - /* - * Short-cut for poll(/"proc/kmsg") which simply checks - * for pending data, not the size; return the count of - * records, not the length. 
- */ - error = log_next_idx - syslog_idx; - } else { - u64 seq = syslog_seq; - u32 idx = syslog_idx; - enum log_flags prev = syslog_prev; - - error = 0; - while (seq < log_next_seq) { - struct log *msg = log_from_idx(idx); - - error += msg_print_text(msg, prev, true, NULL, 0); - idx = log_next(idx); - seq++; - prev = msg->flags; - } - error -= syslog_partial; - } - raw_spin_unlock_irq(&logbuf_lock); - break; - /* Size of the log buffer */ - case SYSLOG_ACTION_SIZE_BUFFER: - error = log_buf_len; - break; - default: - error = -EINVAL; - break; - } -out: - return error; -} - -SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) -{ - return do_syslog(type, buf, len, SYSLOG_FROM_READER); -} - -/* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. - */ -static void call_console_drivers(int level, const char *text, size_t len) -{ - struct console *con; - - trace_console(text, len); - - if (level >= console_loglevel && !ignore_loglevel) - return; - if (!console_drivers) - return; - - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - con->write(con, text, len); - } -} - -/* - * Zap console related locks when oopsing. Only zap at most once - * every 10 seconds, to leave time for slow consoles to print a - * full oops. 
- */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - debug_locks_off(); - /* If a crash is occurring, make sure we can't deadlock */ - raw_spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - sema_init(&console_sem, 1); -} - -/* Check if we have any console registered that can be called early in boot. */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. - */ -static int console_trylock_for_printk(unsigned int cpu) - __releases(&logbuf_lock) -{ - int retval = 0, wake = 0; - - if (console_trylock()) { - retval = 1; - - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. 
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } - } - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - if (wake) - up(&console_sem); - return retval; -} - -int printk_delay_msec __read_mostly; - -static inline void printk_delay(void) -{ - if (unlikely(printk_delay_msec)) { - int m = printk_delay_msec; - - while (m--) { - mdelay(1); - touch_nmi_watchdog(); - } - } -} - -/* - * Continuation lines are buffered, and not committed to the record buffer - * until the line is complete, or a race forces it. The line fragments - * though, are printed immediately to the consoles to ensure everything has - * reached the console in case of a kernel crash. - */ -static struct cont { - char buf[LOG_LINE_MAX]; - size_t len; /* length == 0 means unused buffer */ - size_t cons; /* bytes written to console */ - struct task_struct *owner; /* task of first print*/ - u64 ts_nsec; /* time of first print */ - u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ - enum log_flags flags; /* prefix, newline flags */ - bool flushed:1; /* buffer sealed and committed */ -} cont; - -static void cont_flush(enum log_flags flags) -{ - if (cont.flushed) - return; - if (cont.len == 0) - return; - - if (cont.cons) { - /* - * If a fragment of this line was directly flushed to the - * console; wait for the console to pick up the rest of the - * line. LOG_NOCONS suppresses a duplicated output. - */ - log_store(cont.facility, cont.level, flags | LOG_NOCONS, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flags = flags; - cont.flushed = true; - } else { - /* - * If no fragment of this line ever reached the console, - * just submit it to the store and free the buffer. 
- */ - log_store(cont.facility, cont.level, flags, 0, - NULL, 0, cont.buf, cont.len); - cont.len = 0; - } -} - -static bool cont_add(int facility, int level, const char *text, size_t len) -{ - if (cont.len && cont.flushed) - return false; - - if (cont.len + len > sizeof(cont.buf)) { - /* the line gets too long, split it up in separate records */ - cont_flush(LOG_CONT); - return false; - } - - if (!cont.len) { - cont.facility = facility; - cont.level = level; - cont.owner = current; - cont.ts_nsec = local_clock(); - cont.flags = 0; - cont.cons = 0; - cont.flushed = false; - } - - memcpy(cont.buf + cont.len, text, len); - cont.len += len; - - if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(LOG_CONT); - - return true; -} - -static size_t cont_print_text(char *text, size_t size) -{ - size_t textlen = 0; - size_t len; - - if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { - textlen += print_time(cont.ts_nsec, text); - size -= textlen; - } - - len = cont.len - cont.cons; - if (len > 0) { - if (len+1 > size) - len = size-1; - memcpy(text + textlen, cont.buf + cont.cons, len); - textlen += len; - cont.cons = cont.len; - } - - if (cont.flushed) { - if (cont.flags & LOG_NEWLINE) - text[textlen++] = '\n'; - /* got everything, release buffer */ - cont.len = 0; - } - return textlen; -} - -asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, va_list args) -{ - static int recursion_bug; - static char textbuf[LOG_LINE_MAX]; - char *text = textbuf; - size_t text_len; - enum log_flags lflags = 0; - unsigned long flags; - int this_cpu; - int printed_len = 0; - - boot_delay_msec(level); - printk_delay(); - - /* This stops the holder of console_sem just where we want him */ - local_irq_save(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! 
- */ - if (unlikely(logbuf_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; - goto out_restore_irqs; - } - zap_locks(); - } - - lockdep_off(); - raw_spin_lock(&logbuf_lock); - logbuf_cpu = this_cpu; - - if (recursion_bug) { - static const char recursion_msg[] = - "BUG: recent printk recursion!"; - - recursion_bug = 0; - printed_len += strlen(recursion_msg); - /* emit KERN_CRIT message */ - log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, printed_len); - } - - /* - * The printf needs to come first; we need the syslog - * prefix which might be passed-in as a parameter. - */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); - - /* mark and strip a trailing newline */ - if (text_len && text[text_len-1] == '\n') { - text_len--; - lflags |= LOG_NEWLINE; - } - - /* strip kernel syslog prefix and extract log level or control flags */ - if (facility == 0) { - int kern_level = printk_get_level(text); - - if (kern_level) { - const char *end_of_header = printk_skip_level(text); - switch (kern_level) { - case '0' ... '7': - if (level == -1) - level = kern_level - '0'; - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; - } - text_len -= end_of_header - text; - text = (char *)end_of_header; - } - } - - if (level == -1) - level = default_message_loglevel; - - if (dict) - lflags |= LOG_PREFIX|LOG_NEWLINE; - - if (!(lflags & LOG_NEWLINE)) { - /* - * Flush the conflicting buffer. An earlier newline was missing, - * or another task also prints continuation lines. 
- */ - if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(LOG_NEWLINE); - - /* buffer line if possible, otherwise store it right away */ - if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); - } else { - bool stored = false; - - /* - * If an earlier newline was missing and it was the same task, - * either merge it with the current buffer and flush, or if - * there was a race with interrupts (prefix == true) then just - * flush it out and store this line separately. - */ - if (cont.len && cont.owner == current) { - if (!(lflags & LOG_PREFIX)) - stored = cont_add(facility, level, text, text_len); - cont_flush(LOG_NEWLINE); - } - - if (!stored) - log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); - } - printed_len += text_len; - - /* - * Try to acquire and then immediately release the console semaphore. - * The release will print out buffers and wake up /dev/kmsg and syslog() - * users. - * - * The console_trylock_for_printk() function will release 'logbuf_lock' - * regardless of whether it actually gets the console semaphore or not. - */ - if (console_trylock_for_printk(this_cpu)) - console_unlock(); - - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); - - return printed_len; -} -EXPORT_SYMBOL(vprintk_emit); - -asmlinkage int vprintk(const char *fmt, va_list args) -{ - return vprintk_emit(0, -1, NULL, 0, fmt, args); -} -EXPORT_SYMBOL(vprintk); - -asmlinkage int printk_emit(int facility, int level, - const char *dict, size_t dictlen, - const char *fmt, ...) -{ - va_list args; - int r; - - va_start(args, fmt); - r = vprintk_emit(facility, level, dict, dictlen, fmt, args); - va_end(args); - - return r; -} -EXPORT_SYMBOL(printk_emit); - -/** - * printk - print a kernel message - * @fmt: format string - * - * This is printk(). It can be called from any context. We want it to work. - * - * We try to grab the console_lock. 
If we succeed, it's easy - we log the - * output and call the console drivers. If we fail to get the semaphore, we - * place the output into the log buffer and return. The current holder of - * the console_sem will notice the new output in console_unlock(); and will - * send it to the consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ -asmlinkage int printk(const char *fmt, ...) -{ - va_list args; - int r; - -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif - va_start(args, fmt); - r = vprintk_emit(0, -1, NULL, 0, fmt, args); - va_end(args); - - return r; -} -EXPORT_SYMBOL(printk); - -#else /* CONFIG_PRINTK */ - -#define LOG_LINE_MAX 0 -#define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 -static u64 syslog_seq; -static u32 syslog_idx; -static u64 console_seq; -static u32 console_idx; -static enum log_flags syslog_prev; -static u64 log_first_seq; -static u32 log_first_idx; -static u64 log_next_seq; -static enum log_flags console_prev; -static struct cont { - size_t len; - size_t cons; - u8 level; - bool flushed:1; -} cont; -static struct log *log_from_idx(u32 idx) { return NULL; } -static u32 log_next(u32 idx) { return 0; } -static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, enum log_flags prev, - bool syslog, char *buf, size_t size) { return 0; } -static size_t cont_print_text(char *text, size_t size) { return 0; } - -#endif /* CONFIG_PRINTK */ - -#ifdef CONFIG_EARLY_PRINTK -struct console *early_console; - -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - 
char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - -asmlinkage void early_printk(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} -#endif - -static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. - */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - if (!brl_options) - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - if (!brl_options) - selected_console = i; - c = &console_cmdline[i]; - strlcpy(c->name, name, sizeof(c->name)); - c->options = options; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - c->brl_options = brl_options; -#endif - c->index = idx; - return 0; -} -/* - * Set up a list of consoles. Called from init/main.c - */ -static int __init console_setup(char *str) -{ - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ - char *s, *options, *brl_options = NULL; - int idx; - -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (!memcmp(str, "brl,", 4)) { - brl_options = ""; - str += 4; - } else if (!memcmp(str, "brl=", 4)) { - brl_options = str + 4; - str = strchr(brl_options, ','); - if (!str) { - printk(KERN_ERR "need port name after brl=\n"); - return 1; - } - *(str++) = 0; - } -#endif - - /* - * Decode str into name, index, options. 
- */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(buf, "ttyS"); - strncpy(buf + 4, str, sizeof(buf) - 5); - } else { - strncpy(buf, str, sizeof(buf) - 1); - } - buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) - *(options++) = 0; -#ifdef __sparc__ - if (!strcmp(str, "ttya")) - strcpy(buf, "ttyS0"); - if (!strcmp(str, "ttyb")) - strcpy(buf, "ttyS1"); -#endif - for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') - break; - idx = simple_strtoul(s, NULL, 10); - *s = 0; - - __add_preferred_console(buf, idx, options, brl_options); - console_set_on_cmdline = 1; - return 1; -} -__setup("console=", console_setup); - -/** - * add_preferred_console - add a device to the list of preferred consoles. - * @name: device name - * @idx: device index - * @options: options for this console - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. 
- */ -int add_preferred_console(char *name, int idx, char *options) -{ - return __add_preferred_console(name, idx, options, NULL); -} - -int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) -{ - struct console_cmdline *c; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - c = &console_cmdline[i]; - strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx_new; - return i; - } - /* not found */ - return -1; -} - -bool console_suspend_enabled = 1; -EXPORT_SYMBOL(console_suspend_enabled); - -static int __init console_suspend_disable(char *str) -{ - console_suspend_enabled = 0; - return 1; -} -__setup("no_console_suspend", console_suspend_disable); -module_param_named(console_suspend, console_suspend_enabled, - bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(console_suspend, "suspend console during suspend" - " and hibernate operations"); - -/** - * suspend_console - suspend the console subsystem - * - * This disables printk() while we go into suspend states - */ -void suspend_console(void) -{ - if (!console_suspend_enabled) - return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); - console_lock(); - console_suspended = 1; - up(&console_sem); -} - -void resume_console(void) -{ - if (!console_suspend_enabled) - return; - down(&console_sem); - console_suspended = 0; - console_unlock(); -} - -/** - * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused - * - * If printk() is called from a CPU that is not online yet, the messages - * will be spooled but will not show up on the console. This function is - * called when a new CPU comes online (or fails to come up), and ensures - * that any such output gets printed. 
- */ -static int console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: - console_lock(); - console_unlock(); - } - return NOTIFY_OK; -} - -/** - * console_lock - lock the console system for exclusive use. - * - * Acquires a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. - * - * Can sleep, returns nothing. - */ -void console_lock(void) -{ - might_sleep(); - - down(&console_sem); - if (console_suspended) - return; - console_locked = 1; - console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); -} -EXPORT_SYMBOL(console_lock); - -/** - * console_trylock - try to lock the console system for exclusive use. - * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. - * - * returns 1 on success, and 0 on failure to acquire the lock. - */ -int console_trylock(void) -{ - if (down_trylock(&console_sem)) - return 0; - if (console_suspended) { - up(&console_sem); - return 0; - } - console_locked = 1; - console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); - return 1; -} -EXPORT_SYMBOL(console_trylock); - -int is_console_locked(void) -{ - return console_locked; -} - -static void console_cont_flush(char *text, size_t size) -{ - unsigned long flags; - size_t len; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - - if (!cont.len) - goto out; - - /* - * We still queue earlier records, likely because the console was - * busy. The earlier ones need to be printed before this one, we - * did not flush any fragment so far, so just let it queue up. 
- */ - if (console_seq < log_next_seq && !cont.cons) - goto out; - - len = cont_print_text(text, size); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, text, len); - start_critical_timings(); - local_irq_restore(flags); - return; -out: - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -} - -/** - * console_unlock - unlock the console system - * - * Releases the console_lock which the caller holds on the console system - * and the console driver list. - * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. - * - * If there is output waiting, we wake /dev/kmsg and syslog() users. - * - * console_unlock(); may be called from any context. - */ -void console_unlock(void) -{ - static char text[LOG_LINE_MAX + PREFIX_MAX]; - static u64 seen_seq; - unsigned long flags; - bool wake_klogd = false; - bool retry; - - if (console_suspended) { - up(&console_sem); - return; - } - - console_may_schedule = 0; - - /* flush buffered message fragment immediately to console */ - console_cont_flush(text, sizeof(text)); -again: - for (;;) { - struct log *msg; - size_t len; - int level; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (seen_seq != log_next_seq) { - wake_klogd = true; - seen_seq = log_next_seq; - } - - if (console_seq < log_first_seq) { - /* messages are gone, move to first one */ - console_seq = log_first_seq; - console_idx = log_first_idx; - console_prev = 0; - } -skip: - if (console_seq == log_next_seq) - break; - - msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it. - */ - console_idx = log_next(console_idx); - console_seq++; - /* - * We will get here again when we register a new - * CON_PRINTBUFFER console. Clear the flag so we - * will properly dump everything later. 
- */ - msg->flags &= ~LOG_NOCONS; - console_prev = msg->flags; - goto skip; - } - - level = msg->level; - len = msg_print_text(msg, console_prev, false, - text, sizeof(text)); - console_idx = log_next(console_idx); - console_seq++; - console_prev = msg->flags; - raw_spin_unlock(&logbuf_lock); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, text, len); - start_critical_timings(); - local_irq_restore(flags); - } - console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); - - /* Release the exclusive_console once it is used */ - if (unlikely(exclusive_console)) - exclusive_console = NULL; - - raw_spin_unlock(&logbuf_lock); - - up(&console_sem); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - raw_spin_lock(&logbuf_lock); - retry = console_seq != log_next_seq; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - if (retry && console_trylock()) - goto again; - - if (wake_klogd) - wake_up_klogd(); -} -EXPORT_SYMBOL(console_unlock); - -/** - * console_conditional_schedule - yield the CPU if required - * - * If the console code is currently allowed to sleep, and - * if this CPU should yield the CPU to another task, do - * so here. - * - * Must be called within console_lock();. - */ -void __sched console_conditional_schedule(void) -{ - if (console_may_schedule) - cond_resched(); -} -EXPORT_SYMBOL(console_conditional_schedule); - -void console_unblank(void) -{ - struct console *c; - - /* - * console_unblank can no longer be called in interrupt context unless - * oops_in_progress is set to 1.. 
- */ - if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) - return; - } else - console_lock(); - - console_locked = 1; - console_may_schedule = 0; - for_each_console(c) - if ((c->flags & CON_ENABLED) && c->unblank) - c->unblank(); - console_unlock(); -} - -/* - * Return the console tty driver structure and its associated index - */ -struct tty_driver *console_device(int *index) -{ - struct console *c; - struct tty_driver *driver = NULL; - - console_lock(); - for_each_console(c) { - if (!c->device) - continue; - driver = c->device(c, index); - if (driver) - break; - } - console_unlock(); - return driver; -} - -/* - * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can - * re-enable output afterwards. - */ -void console_stop(struct console *console) -{ - console_lock(); - console->flags &= ~CON_ENABLED; - console_unlock(); -} -EXPORT_SYMBOL(console_stop); - -void console_start(struct console *console) -{ - console_lock(); - console->flags |= CON_ENABLED; - console_unlock(); -} -EXPORT_SYMBOL(console_start); - -static int __read_mostly keep_bootcon; - -static int __init keep_bootcon_setup(char *str) -{ - keep_bootcon = 1; - printk(KERN_INFO "debug: skip boot console de-registration.\n"); - - return 0; -} - -early_param("keep_bootcon", keep_bootcon_setup); - -/* - * The console driver calls this routine during kernel initialization - * to register the console printing procedure with printk() and to - * print any messages that were printed by the kernel before the - * console driver was initialized. - * - * This can happen pretty early during the boot process (because of - * early_printk) - sometimes before setup_arch() completes - be careful - * of what kernel features are used - they may not be initialised yet. 
- * - * There are two types of consoles - bootconsoles (early_printk) and - * "real" consoles (everything which is not a bootconsole) which are - * handled differently. - * - Any number of bootconsoles can be registered at any time. - * - As soon as a "real" console is registered, all bootconsoles - * will be unregistered automatically. - * - Once a "real" console is registered, any attempt to register a - * bootconsoles will be rejected - */ -void register_console(struct console *newcon) -{ - int i; - unsigned long flags; - struct console *bcon = NULL; - - /* - * before we register a new CON_BOOT console, make sure we don't - * already have a valid console - */ - if (console_drivers && newcon->flags & CON_BOOT) { - /* find the last or real console */ - for_each_console(bcon) { - if (!(bcon->flags & CON_BOOT)) { - printk(KERN_INFO "Too late to register bootconsole %s%d\n", - newcon->name, newcon->index); - return; - } - } - } - - if (console_drivers && console_drivers->flags & CON_BOOT) - bcon = console_drivers; - - if (preferred_console < 0 || bcon || !console_drivers) - preferred_console = selected_console; - - if (newcon->early_setup) - newcon->early_setup(); - - /* - * See if we want to use this console driver. If we - * didn't select a console we take the first one - * that registers here. - */ - if (preferred_console < 0) { - if (newcon->index < 0) - newcon->index = 0; - if (newcon->setup == NULL || - newcon->setup(newcon, NULL) == 0) { - newcon->flags |= CON_ENABLED; - if (newcon->device) { - newcon->flags |= CON_CONSDEV; - preferred_console = 0; - } - } - } - - /* - * See if this console matches one we selected on - * the command line. 
- */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; - i++) { - if (strcmp(console_cmdline[i].name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != console_cmdline[i].index) - continue; - if (newcon->index < 0) - newcon->index = console_cmdline[i].index; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console_cmdline[i].brl_options) { - newcon->flags |= CON_BRL; - braille_register_console(newcon, - console_cmdline[i].index, - console_cmdline[i].options, - console_cmdline[i].brl_options); - return; - } -#endif - if (newcon->setup && - newcon->setup(newcon, console_cmdline[i].options) != 0) - break; - newcon->flags |= CON_ENABLED; - newcon->index = console_cmdline[i].index; - if (i == selected_console) { - newcon->flags |= CON_CONSDEV; - preferred_console = selected_console; - } - break; - } - - if (!(newcon->flags & CON_ENABLED)) - return; - - /* - * If we have a bootconsole, and are switching to a real console, - * don't print everything out again, since when the boot console, and - * the real console are the same physical device, it's annoying to - * see the beginning boot messages twice - */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) - newcon->flags &= ~CON_PRINTBUFFER; - - /* - * Put this console in the list - keep the - * preferred driver at the head of the list. - */ - console_lock(); - if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { - newcon->next = console_drivers; - console_drivers = newcon; - if (newcon->next) - newcon->next->flags &= ~CON_CONSDEV; - } else { - newcon->next = console_drivers->next; - console_drivers->next = newcon; - } - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. 
- */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - console_seq = syslog_seq; - console_idx = syslog_idx; - console_prev = syslog_prev; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - /* - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - */ - exclusive_console = newcon; - } - console_unlock(); - console_sysfs_notify(); - - /* - * By unregistering the bootconsoles after we enable the real console - * we get the "console xxx enabled" message on all the consoles - - * boot consoles, real consoles, etc - this is to ensure that end - * users know there might be something in the kernel's log buffer that - * went to the bootconsole (that they do not see on the real console) - */ - if (bcon && - ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && - !keep_bootcon) { - /* we need to iterate through twice, to make sure we print - * everything out, before we unregister the console(s) - */ - printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", - newcon->name, newcon->index); - for_each_console(bcon) - if (bcon->flags & CON_BOOT) - unregister_console(bcon); - } else { - printk(KERN_INFO "%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? 
"boot" : "" , - newcon->name, newcon->index); - } -} -EXPORT_SYMBOL(register_console); - -int unregister_console(struct console *console) -{ - struct console *a, *b; - int res = 1; - -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console->flags & CON_BRL) - return braille_unregister_console(console); -#endif - - console_lock(); - if (console_drivers == console) { - console_drivers=console->next; - res = 0; - } else if (console_drivers) { - for (a=console_drivers->next, b=console_drivers ; - a; b=a, a=b->next) { - if (a == console) { - b->next = a->next; - res = 0; - break; - } - } - } - - /* - * If this isn't the last console and it has CON_CONSDEV set, we - * need to set it on the next preferred console. - */ - if (console_drivers != NULL && console->flags & CON_CONSDEV) - console_drivers->flags |= CON_CONSDEV; - - console_unlock(); - console_sysfs_notify(); - return res; -} -EXPORT_SYMBOL(unregister_console); - -static int __init printk_late_init(void) -{ - struct console *con; - - for_each_console(con) { - if (!keep_bootcon && con->flags & CON_BOOT) { - printk(KERN_INFO "turn off boot console %s%d\n", - con->name, con->index); - unregister_console(con); - } - } - hotcpu_notifier(console_cpu_notify, 0); - return 0; -} -late_initcall(printk_late_init); - -#if defined CONFIG_PRINTK -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - 
.func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - -int printk_sched(const char *fmt, ...) -{ - unsigned long flags; - va_list args; - char *buf; - int r; - - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); - va_end(args); - - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - local_irq_restore(flags); - - return r; -} - -/* - * printk rate limiting, lifted from the networking subsystem. - * - * This enforces a rate limit: not more than 10 kernel messages - * every 5s to make a denial-of-service attack impossible. - */ -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); - -int __printk_ratelimit(const char *func) -{ - return ___ratelimit(&printk_ratelimit_state, func); -} -EXPORT_SYMBOL(__printk_ratelimit); - -/** - * printk_timed_ratelimit - caller-controlled printk ratelimiting - * @caller_jiffies: pointer to caller's state - * @interval_msecs: minimum interval between prints - * - * printk_timed_ratelimit() returns true if more than @interval_msecs - * milliseconds have elapsed since the last time printk_timed_ratelimit() - * returned true. - */ -bool printk_timed_ratelimit(unsigned long *caller_jiffies, - unsigned int interval_msecs) -{ - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; -} -EXPORT_SYMBOL(printk_timed_ratelimit); - -static DEFINE_SPINLOCK(dump_list_lock); -static LIST_HEAD(dump_list); - -/** - * kmsg_dump_register - register a kernel log dumper. 
- * @dumper: pointer to the kmsg_dumper structure - * - * Adds a kernel log dumper to the system. The dump callback in the - * structure will be called when the kernel oopses or panics and must be - * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. - */ -int kmsg_dump_register(struct kmsg_dumper *dumper) -{ - unsigned long flags; - int err = -EBUSY; - - /* The dump callback needs to be set */ - if (!dumper->dump) - return -EINVAL; - - spin_lock_irqsave(&dump_list_lock, flags); - /* Don't allow registering multiple times */ - if (!dumper->registered) { - dumper->registered = 1; - list_add_tail_rcu(&dumper->list, &dump_list); - err = 0; - } - spin_unlock_irqrestore(&dump_list_lock, flags); - - return err; -} -EXPORT_SYMBOL_GPL(kmsg_dump_register); - -/** - * kmsg_dump_unregister - unregister a kmsg dumper. - * @dumper: pointer to the kmsg_dumper structure - * - * Removes a dump device from the system. Returns zero on success and - * %-EINVAL otherwise. - */ -int kmsg_dump_unregister(struct kmsg_dumper *dumper) -{ - unsigned long flags; - int err = -EINVAL; - - spin_lock_irqsave(&dump_list_lock, flags); - if (dumper->registered) { - dumper->registered = 0; - list_del_rcu(&dumper->list); - err = 0; - } - spin_unlock_irqrestore(&dump_list_lock, flags); - synchronize_rcu(); - - return err; -} -EXPORT_SYMBOL_GPL(kmsg_dump_unregister); - -static bool always_kmsg_dump; -module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); - -/** - * kmsg_dump - dump kernel log to kernel message dumpers. - * @reason: the reason (oops, panic etc) for dumping - * - * Call each of the registered dumper's dump() callback, which can - * retrieve the kmsg records with kmsg_dump_get_line() or - * kmsg_dump_get_buffer(). 
- */ -void kmsg_dump(enum kmsg_dump_reason reason) -{ - struct kmsg_dumper *dumper; - unsigned long flags; - - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) - continue; - - /* initialize iterator with data about the stored records */ - dumper->active = true; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - /* invoke dumper which will iterate over records */ - dumper->dump(dumper, reason); - - /* reset iterator */ - dumper->active = false; - } - rcu_read_unlock(); -} - -/** - * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) - * @dumper: registered kmsg dumper - * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to - * @size: maximum size of the buffer - * @len: length of line placed into buffer - * - * Start at the beginning of the kmsg buffer, with the oldest kmsg - * record, and copy one record into the provided buffer. - * - * Consecutive calls will return the next available record moving - * towards the end of the buffer with the youngest messages. - * - * A return value of FALSE indicates that there are no more records to - * read. - * - * The function is similar to kmsg_dump_get_line(), but grabs no locks. 
- */ -bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) -{ - struct log *msg; - size_t l = 0; - bool ret = false; - - if (!dumper->active) - goto out; - - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; - } - - /* last entry */ - if (dumper->cur_seq >= log_next_seq) - goto out; - - msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, 0, syslog, line, size); - - dumper->cur_idx = log_next(dumper->cur_idx); - dumper->cur_seq++; - ret = true; -out: - if (len) - *len = l; - return ret; -} - -/** - * kmsg_dump_get_line - retrieve one kmsg log line - * @dumper: registered kmsg dumper - * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to - * @size: maximum size of the buffer - * @len: length of line placed into buffer - * - * Start at the beginning of the kmsg buffer, with the oldest kmsg - * record, and copy one record into the provided buffer. - * - * Consecutive calls will return the next available record moving - * towards the end of the buffer with the youngest messages. - * - * A return value of FALSE indicates that there are no more records to - * read. 
- */ -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) -{ - unsigned long flags; - bool ret; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(kmsg_dump_get_line); - -/** - * kmsg_dump_get_buffer - copy kmsg log lines - * @dumper: registered kmsg dumper - * @syslog: include the "<4>" prefixes - * @buf: buffer to copy the line to - * @size: maximum size of the buffer - * @len: length of line placed into buffer - * - * Start at the end of the kmsg buffer and fill the provided buffer - * with as many of the the *youngest* kmsg records that fit into it. - * If the buffer is large enough, all available kmsg records will be - * copied with a single call. - * - * Consecutive calls will fill the buffer with the next block of - * available older records, not including the earlier retrieved ones. - * - * A return value of FALSE indicates that there are no more records to - * read. 
- */ -bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, - char *buf, size_t size, size_t *len) -{ - unsigned long flags; - u64 seq; - u32 idx; - u64 next_seq; - u32 next_idx; - enum log_flags prev; - size_t l = 0; - bool ret = false; - - if (!dumper->active) - goto out; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; - } - - /* last entry */ - if (dumper->cur_seq >= dumper->next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } - - /* calculate length of entire buffer */ - seq = dumper->cur_seq; - idx = dumper->cur_idx; - prev = 0; - while (seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); - - l += msg_print_text(msg, prev, true, NULL, 0); - idx = log_next(idx); - seq++; - prev = msg->flags; - } - - /* move first record forward until length fits into the buffer */ - seq = dumper->cur_seq; - idx = dumper->cur_idx; - prev = 0; - while (l > size && seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); - - l -= msg_print_text(msg, prev, true, NULL, 0); - idx = log_next(idx); - seq++; - prev = msg->flags; - } - - /* last message in next interation */ - next_seq = seq; - next_idx = idx; - - l = 0; - prev = 0; - while (seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); - - l += msg_print_text(msg, prev, syslog, buf + l, size - l); - idx = log_next(idx); - seq++; - prev = msg->flags; - } - - dumper->next_seq = next_seq; - dumper->next_idx = next_idx; - ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -out: - if (len) - *len = l; - return ret; -} -EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - -/** - * kmsg_dump_rewind_nolock - reset the interator (unlocked version) - * @dumper: registered kmsg dumper - * - * Reset the dumper's iterator so that kmsg_dump_get_line() and - * kmsg_dump_get_buffer() can be called again and 
used multiple - * times within the same dumper.dump() callback. - * - * The function is similar to kmsg_dump_rewind(), but grabs no locks. - */ -void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) -{ - dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; -} - -/** - * kmsg_dump_rewind - reset the interator - * @dumper: registered kmsg dumper - * - * Reset the dumper's iterator so that kmsg_dump_get_line() and - * kmsg_dump_get_buffer() can be called again and used multiple - * times within the same dumper.dump() callback. - */ -void kmsg_dump_rewind(struct kmsg_dumper *dumper) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - kmsg_dump_rewind_nolock(dumper); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -} -EXPORT_SYMBOL_GPL(kmsg_dump_rewind); - -static char dump_stack_arch_desc_str[128]; - -/** - * dump_stack_set_arch_desc - set arch-specific str to show with task dumps - * @fmt: printf-style format string - * @...: arguments for the format string - * - * The configured string will be printed right after utsname during task - * dumps. Usually used to add arch-specific system identifiers. If an - * arch wants to make use of such an ID string, it should initialize this - * as soon as possible during boot. - */ -void __init dump_stack_set_arch_desc(const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), - fmt, args); - va_end(args); -} - -/** - * dump_stack_print_info - print generic debug info for dump_stack() - * @log_lvl: log level - * - * Arch-specific dump_stack() implementations can use this function to - * print out the same debug information as the generic dump_stack(). 
- */ -void dump_stack_print_info(const char *log_lvl) -{ - printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", - log_lvl, raw_smp_processor_id(), current->pid, current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - if (dump_stack_arch_desc_str[0] != '\0') - printk("%sHardware name: %s\n", - log_lvl, dump_stack_arch_desc_str); - - print_worker_info(log_lvl, current); -} - -/** - * show_regs_print_info - print generic debug info for show_regs() - * @log_lvl: log level - * - * show_regs() implementations can use this function to print out generic - * debug information. - */ -void show_regs_print_info(const char *log_lvl) -{ - dump_stack_print_info(log_lvl); - - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); -} - -#endif diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile new file mode 100644 index 000000000000..36d306d9273c --- /dev/null +++ b/kernel/printk/Makefile @@ -0,0 +1 @@ +obj-y = printk.o diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c new file mode 100644 index 000000000000..69b0890ed7e5 --- /dev/null +++ b/kernel/printk/printk.c @@ -0,0 +1,2924 @@ +/* + * linux/kernel/printk.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Modified to make sys_syslog() more flexible: added commands to + * return the last 4k of kernel messages, regardless of whether + * they've been read or not. Added option to suppress kernel printk's + * to the console. Added hook for sending the console messages + * elsewhere, in preparation for a serial line console (someday). + * Ted Ts'o, 2/11/93. + * Modified for sysctl support, 1/8/97, Chris Horn. 
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
+ *     manfred@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ *	01Mar01 Andrew Morton
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h>			/* For in_interrupt() */
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/aio.h>
+#include <linux/syscalls.h>
+#include <linux/kexec.h>
+#include <linux/kdb.h>
+#include <linux/ratelimit.h>
+#include <linux/kmsg_dump.h>
+#include <linux/syslog.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/poll.h>
+#include <linux/irq_work.h>
+#include <linux/utsname.h>
+
+#include <asm/uaccess.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/printk.h>
+
+/* printk's without a loglevel use this.. */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
+
+/* We show everything that is MORE important than this.. */
+#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
+#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
+
+/*
+ * Console log level settings. The slot order is fixed by the per-slot
+ * comments below.
+ */
+int console_printk[4] = {
+	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */
+	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */
+	MINIMUM_CONSOLE_LOGLEVEL,	/* minimum_console_loglevel */
+	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
+};
+
+/*
+ * Low level drivers may need that to know if they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+int oops_in_progress;
+EXPORT_SYMBOL(oops_in_progress);
+
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DEFINE_SEMAPHORE(console_sem);
+struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_lock_dep_map = {
+	.name = "console_lock"
+};
+#endif
+
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track if we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it and are racing), but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console semaphore held.
+ */
+static int console_locked, console_suspended;
+
+/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
+ * Array of consoles built from command line options (console=)
+ */
+struct console_cmdline
+{
+	char	name[8];			/* Name of the driver	    */
+	int	index;				/* Minor dev. to use	    */
+	char	*options;			/* Options for the driver   */
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+	char	*brl_options;			/* Options for braille driver */
+#endif
+};
+
+#define MAX_CMDLINECONSOLES 8
+
+static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+static int selected_console = -1;
+static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
+
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
+/*
+ * The printk log buffer consists of a chain of concatenated variable
+ * length records. Every record starts with a record header, containing
+ * the overall length of the record.
+ *
+ * The heads to the first and last entry in the buffer, as well as the
+ * sequence numbers of both of these entries are maintained when messages
+ * are stored.
+ *
+ * If the heads indicate available messages, the length in the header
+ * tells the start of the next message. A length == 0 for the next message
+ * indicates a wrap-around to the beginning of the buffer.
+ *
+ * Every record carries the monotonic timestamp in nanoseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual
+ * kernel messages use LOG_KERN; userspace-injected messages always carry
+ * a matching syslog facility, by default LOG_USER. The origin of every
+ * message can be reliably determined that way.
+ *
+ * The human readable log message directly follows the message header. The
+ * length of the message text is stored in the header, the stored message
+ * is not terminated.
+ *
+ * Optionally, a message can carry a dictionary of properties (key/value pairs),
+ * to provide userspace with a machine-readable message context.
+ *
+ * Examples for well-defined, commonly used property names are:
+ *   DEVICE=b12:8               device identifier
+ *                                b12:8         block dev_t
+ *                                c127:3        char dev_t
+ *                                n8            netdev ifindex
+ *                                +sound:card0  subsystem:devname
+ *   SUBSYSTEM=pci              driver-core subsystem name
+ *
+ * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
+ * follows directly after a '=' character. Every property is terminated by
+ * a '\0' character. The last property is not terminated.
+ *
+ * Example of a message structure:
+ *   0000  ff 8f 00 00 00 00 00 00      monotonic time in nsec
+ *   0008  34 00                        record is 52 bytes long
+ *   000a        0b 00                  text is 11 bytes long
+ *   000c              1f 00            dictionary is 23 bytes long
+ *   000e                    03 00      LOG_KERN (facility) LOG_ERR (level)
+ *   0010  69 74 27 73 20 61 20 6c      "it's a l"
+ *         69 6e 65                     "ine"
+ *   001b           44 45 56 49 43      "DEVIC"
+ *         45 3d 62 38 3a 32 00 44      "E=b8:2\0D"
+ *         52 49 56 45 52 3d 62 75      "RIVER=bu"
+ *         67                           "g"
+ *   0032     00 00 00                  padding to next message header
+ *
+ * The 'struct log' buffer header must never be directly exported to
+ * userspace, it is a kernel-private implementation detail that might
+ * need to be changed in the future, when the requirements change.
+ *
+ * /dev/kmsg exports the structured data in the following line format:
+ *   "level,sequnum,timestamp;<message text>\n"
+ *
+ * The optional key/value pairs are attached as continuation lines starting
+ * with a space character and terminated by a newline. All possible
+ * non-printable characters are escaped in the "\xff" notation.
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
+ */
+
+enum log_flags {
+	LOG_NOCONS	= 1,	/* already flushed, do not print to console */
+	LOG_NEWLINE	= 2,	/* text ended with a newline */
+	LOG_PREFIX	= 4,	/* text started with a prefix */
+	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
+};
+
+/* record header; text, dictionary and padding follow it in the buffer */
+struct log {
+	u64 ts_nsec;		/* timestamp in nanoseconds */
+	u16 len;		/* length of entire record */
+	u16 text_len;		/* length of text buffer */
+	u16 dict_len;		/* length of dictionary buffer */
+	u8 facility;		/* syslog facility */
+	u8 flags:5;		/* internal record flags */
+	u8 level:3;		/* syslog level */
+};
+
+/*
+ * The logbuf_lock protects kmsg buffer, indices, counters. It is also
+ * used in interesting ways to provide interlocking in console_unlock();
+ */
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
+
+#ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* the next printk record to read by syslog(READ) or /proc/kmsg */
+static u64 syslog_seq;
+static u32 syslog_idx;
+static enum log_flags syslog_prev;
+static size_t syslog_partial;
+
+/* index and sequence number of the first record stored in the buffer */
+static u64 log_first_seq;
+static u32 log_first_idx;
+
+/* index and sequence number of the next record to store in the buffer */
+static u64 log_next_seq;
+static u32 log_next_idx;
+
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags console_prev;
+
+/* the next printk record to read after the last 'clear' command */
+static u64 clear_seq;
+static u32 clear_idx;
+
+/* parenthesized so the macro expands safely inside any expression */
+#define PREFIX_MAX		32
+#define LOG_LINE_MAX		(1024 - PREFIX_MAX)
+
+/* record buffer */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define LOG_ALIGN 4
+#else
+#define LOG_ALIGN __alignof__(struct log)
+#endif
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
+static char *log_buf = __log_buf;
+static u32 log_buf_len = __LOG_BUF_LEN;
+
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int logbuf_cpu = UINT_MAX;
+
+/* human readable text of the record */
+static char *log_text(const struct log *msg)
+{
+	return (char *)msg + sizeof(struct log);
+}
+
+/* optional key/value pair dictionary attached to the record */
+static char *log_dict(const struct log *msg)
+{
+	return (char *)msg + sizeof(struct log) + msg->text_len;
+}
+
+/* get record by index; idx must point to valid msg */
+static struct log *log_from_idx(u32 idx)
+{
+	struct log *msg = (struct log *)(log_buf + idx);
+
+	/*
+	 * A length == 0 record is the end of buffer marker. Wrap around and
+	 * read the message at the start of the buffer.
+	 */
+	if (!msg->len)
+		return (struct log *)log_buf;
+	return msg;
+}
+
+/* get next record; idx must point to valid msg */
+static u32 log_next(u32 idx)
+{
+	struct log *msg = (struct log *)(log_buf + idx);
+
+	/* length == 0 indicates the end of the buffer; wrap */
+	/*
+	 * A length == 0 record is the end of buffer marker. Wrap around and
+	 * read the message at the start of the buffer as *this* one, and
+	 * return the one after that.
+	 */
+	if (!msg->len) {
+		msg = (struct log *)log_buf;
+		return msg->len;
+	}
+	return idx + msg->len;
+}
+
+/*
+ * insert record into the buffer, discard old ones, update heads
+ *
+ * A ts_nsec of 0 means "stamp the record with local_clock()".
+ */
+static void log_store(int facility, int level,
+		      enum log_flags flags, u64 ts_nsec,
+		      const char *dict, u16 dict_len,
+		      const char *text, u16 text_len)
+{
+	struct log *msg;
+	u32 size, pad_len;
+
+	/* number of '\0' padding bytes to next message */
+	size = sizeof(struct log) + text_len + dict_len;
+	pad_len = (-size) & (LOG_ALIGN - 1);
+	size += pad_len;
+
+	while (log_first_seq < log_next_seq) {
+		u32 free;
+
+		if (log_next_idx > log_first_idx)
+			free = max(log_buf_len - log_next_idx, log_first_idx);
+		else
+			free = log_first_idx - log_next_idx;
+
+		if (free > size + sizeof(struct log))
+			break;
+
+		/* drop old messages until we have enough contiguous space */
+		log_first_idx = log_next(log_first_idx);
+		log_first_seq++;
+	}
+
+	if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
+		/*
+		 * This message + an additional empty header does not fit
+		 * at the end of the buffer. Add an empty header with len == 0
+		 * to signify a wrap around.
+		 */
+		memset(log_buf + log_next_idx, 0, sizeof(struct log));
+		log_next_idx = 0;
+	}
+
+	/* fill message */
+	msg = (struct log *)(log_buf + log_next_idx);
+	memcpy(log_text(msg), text, text_len);
+	msg->text_len = text_len;
+	memcpy(log_dict(msg), dict, dict_len);
+	msg->dict_len = dict_len;
+	msg->facility = facility;
+	msg->level = level & 7;
+	msg->flags = flags & 0x1f;
+	if (ts_nsec > 0)
+		msg->ts_nsec = ts_nsec;
+	else
+		msg->ts_nsec = local_clock();
+	memset(log_dict(msg) + dict_len, 0, pad_len);
+	msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
+
+	/* insert message */
+	log_next_idx += msg->len;
+	log_next_seq++;
+}
+
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+/* non-zero when the given SYSLOG_ACTION_* type needs a capability */
+static int syslog_action_restricted(int type)
+{
+	if (dmesg_restrict)
+		return 1;
+	/*
+	 * Unless restricted, we allow "read all" and "get buffer size"
+	 * for everybody.
+	 */
+	return type != SYSLOG_ACTION_READ_ALL &&
+	       type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+/*
+ * Returns 0 when the caller may perform the action, -EPERM when a
+ * restricted action lacks the capability, or the result of
+ * security_syslog() for unrestricted actions.
+ */
+static int check_syslog_permissions(int type, bool from_file)
+{
+	/*
+	 * If this is from /proc/kmsg and we've already opened it, then we've
+	 * already done the capabilities checks at open time.
+	 */
+	if (from_file && type != SYSLOG_ACTION_OPEN)
+		return 0;
+
+	if (syslog_action_restricted(type)) {
+		if (capable(CAP_SYSLOG))
+			return 0;
+		/*
+		 * For historical reasons, accept CAP_SYS_ADMIN too, with
+		 * a warning.
+		 */
+		if (capable(CAP_SYS_ADMIN)) {
+			pr_warn_once("%s (%d): Attempt to access syslog with "
+				     "CAP_SYS_ADMIN but no CAP_SYSLOG "
+				     "(deprecated).\n",
+				 current->comm, task_pid_nr(current));
+			return 0;
+		}
+		return -EPERM;
+	}
+	return security_syslog(type);
+}
+
+
+/* /dev/kmsg - userspace message inject/listen interface */
+struct devkmsg_user {
+	u64 seq;
+	u32 idx;
+	enum log_flags prev;
+	struct mutex lock;
+	char buf[8192];
+};
+
+/*
+ * Inject one message written to /dev/kmsg. Returns the number of bytes
+ * consumed, -EINVAL when the write exceeds LOG_LINE_MAX, -ENOMEM or
+ * -EFAULT.
+ */
+static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
+			      unsigned long count, loff_t pos)
+{
+	char *buf, *line;
+	int i;
+	int level = default_message_loglevel;
+	int facility = 1;	/* LOG_USER */
+	size_t len = iov_length(iv, count);
+	ssize_t ret = len;
+
+	if (len > LOG_LINE_MAX)
+		return -EINVAL;
+	buf = kmalloc(len+1, GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	line = buf;
+	for (i = 0; i < count; i++) {
+		if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		line += iv[i].iov_len;
+	}
+
+	/*
+	 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
+	 * the decimal value represents 32bit, the lower 3 bit are the log
+	 * level, the rest are the log facility.
+	 *
+	 * If no prefix or no userspace facility is specified, we
+	 * enforce LOG_USER, to be able to reliably distinguish
+	 * kernel-generated messages from userspace-injected ones.
+	 */
+	line = buf;
+	if (line[0] == '<') {
+		char *endp = NULL;
+
+		i = simple_strtoul(line+1, &endp, 10);
+		if (endp && endp[0] == '>') {
+			level = i & 7;
+			if (i >> 3)
+				facility = i >> 3;
+			endp++;
+			len -= endp - line;
+			line = endp;
+		}
+	}
+	line[len] = '\0';
+
+	printk_emit(facility, level, NULL, 0, "%s", line);
+out:
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * Return exactly one record per read(), formatted into user->buf.
+ * Blocks until a record is available unless O_NONBLOCK is set; returns
+ * -EPIPE once after records were overwritten underneath the reader.
+ */
+static ssize_t devkmsg_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct devkmsg_user *user = file->private_data;
+	struct log *msg;
+	u64 ts_usec;
+	size_t i;
+	char cont = '-';
+	size_t len;
+	ssize_t ret;
+
+	if (!user)
+		return -EBADF;
+
+	ret = mutex_lock_interruptible(&user->lock);
+	if (ret)
+		return ret;
+	raw_spin_lock_irq(&logbuf_lock);
+	while (user->seq == log_next_seq) {
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			raw_spin_unlock_irq(&logbuf_lock);
+			goto out;
+		}
+
+		raw_spin_unlock_irq(&logbuf_lock);
+		ret = wait_event_interruptible(log_wait,
+					       user->seq != log_next_seq);
+		if (ret)
+			goto out;
+		raw_spin_lock_irq(&logbuf_lock);
+	}
+
+	if (user->seq < log_first_seq) {
+		/* our last seen message is gone, return error and reset */
+		user->idx = log_first_idx;
+		user->seq = log_first_seq;
+		ret = -EPIPE;
+		raw_spin_unlock_irq(&logbuf_lock);
+		goto out;
+	}
+
+	msg = log_from_idx(user->idx);
+	ts_usec = msg->ts_nsec;
+	do_div(ts_usec, 1000);
+
+	/*
+	 * If we couldn't merge continuation line fragments during the print,
+	 * export the stored flags to allow an optional external merge of the
+	 * records. Merging the records isn't always necessarily correct, like
+	 * when we hit a race during printing. In most cases though, it produces
+	 * better readable output. 'c' in the record flags mark the first
+	 * fragment of a line, '+' the following.
+	 */
+	if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
+		cont = 'c';
+	else if ((msg->flags & LOG_CONT) ||
+		 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+		cont = '+';
+
+	len = sprintf(user->buf, "%u,%llu,%llu,%c;",
+		      (msg->facility << 3) | msg->level,
+		      user->seq, ts_usec, cont);
+	user->prev = msg->flags;
+
+	/* escape non-printable characters */
+	for (i = 0; i < msg->text_len; i++) {
+		unsigned char c = log_text(msg)[i];
+
+		if (c < ' ' || c >= 127 || c == '\\')
+			len += sprintf(user->buf + len, "\\x%02x", c);
+		else
+			user->buf[len++] = c;
+	}
+	user->buf[len++] = '\n';
+
+	if (msg->dict_len) {
+		bool line = true;
+
+		for (i = 0; i < msg->dict_len; i++) {
+			unsigned char c = log_dict(msg)[i];
+
+			if (line) {
+				user->buf[len++] = ' ';
+				line = false;
+			}
+
+			if (c == '\0') {
+				user->buf[len++] = '\n';
+				line = true;
+				continue;
+			}
+
+			if (c < ' ' || c >= 127 || c == '\\') {
+				len += sprintf(user->buf + len, "\\x%02x", c);
+				continue;
+			}
+
+			user->buf[len++] = c;
+		}
+		user->buf[len++] = '\n';
+	}
+
+	user->idx = log_next(user->idx);
+	user->seq++;
+	raw_spin_unlock_irq(&logbuf_lock);
+
+	if (len > count) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (copy_to_user(buf, user->buf, len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = len;
+out:
+	mutex_unlock(&user->lock);
+	return ret;
+}
+
+/*
+ * Reposition the reader; only offset 0 is accepted (-ESPIPE otherwise)
+ * and whence selects the first, first-after-clear or next record.
+ */
+static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct devkmsg_user *user = file->private_data;
+	loff_t ret = 0;
+
+	if (!user)
+		return -EBADF;
+	if (offset)
+		return -ESPIPE;
+
+	raw_spin_lock_irq(&logbuf_lock);
+	switch (whence) {
+	case SEEK_SET:
+		/* the first record */
+		user->idx = log_first_idx;
+		user->seq = log_first_seq;
+		break;
+	case SEEK_DATA:
+		/*
+		 * The first record after the last SYSLOG_ACTION_CLEAR,
+		 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
+		 * changes no global state, and does not clear anything.
+		 */
+		user->idx = clear_idx;
+		user->seq = clear_seq;
+		break;
+	case SEEK_END:
+		/* after the last record */
+		user->idx = log_next_idx;
+		user->seq = log_next_seq;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	raw_spin_unlock_irq(&logbuf_lock);
+	return ret;
+}
+
+/* report readable data, plus POLLERR|POLLPRI when records were lost */
+static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+{
+	struct devkmsg_user *user = file->private_data;
+	int ret = 0;
+
+	if (!user)
+		return POLLERR|POLLNVAL;
+
+	poll_wait(file, &log_wait, wait);
+
+	raw_spin_lock_irq(&logbuf_lock);
+	if (user->seq < log_next_seq) {
+		/* return error when data has vanished underneath us */
+		if (user->seq < log_first_seq)
+			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+		else
+			ret = POLLIN|POLLRDNORM;
+	}
+	raw_spin_unlock_irq(&logbuf_lock);
+
+	return ret;
+}
+
+/* allocate the per-open reader state, positioned at the first record */
+static int devkmsg_open(struct inode *inode, struct file *file)
+{
+	struct devkmsg_user *user;
+	int err;
+
+	/* write-only does not need any file context */
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+		return 0;
+
+	err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+				       SYSLOG_FROM_READER);
+	if (err)
+		return err;
+
+	user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+	if (!user)
+		return -ENOMEM;
+
+	mutex_init(&user->lock);
+	/*
+	 * kmalloc() does not zero the allocation and devkmsg_read() tests
+	 * user->prev before it ever assigns it, so start from "no previous
+	 * record" instead of whatever the allocator handed back.
+	 */
+	user->prev = 0;
+
+	raw_spin_lock_irq(&logbuf_lock);
+	user->idx = log_first_idx;
+	user->seq = log_first_seq;
+	raw_spin_unlock_irq(&logbuf_lock);
+
+	file->private_data = user;
+	return 0;
+}
+
+/* free the reader state; write-only opens never allocated one */
+static int devkmsg_release(struct inode *inode, struct file *file)
+{
+	struct devkmsg_user *user = file->private_data;
+
+	if (!user)
+		return 0;
+
+	mutex_destroy(&user->lock);
+	kfree(user);
+	return 0;
+}
+
+const struct file_operations kmsg_fops = {
+	.open = devkmsg_open,
+	.read = devkmsg_read,
+	.aio_write = devkmsg_writev,
+	.llseek = devkmsg_llseek,
+	.poll = devkmsg_poll,
+	.release = devkmsg_release,
+};
+
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcoreinfo
+ *
+ * /proc/vmcoreinfo is used by various utilities, like crash and makedumpfile
to
+ * obtain access to symbols that are otherwise very difficult to locate.  These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+	VMCOREINFO_SYMBOL(log_buf);
+	VMCOREINFO_SYMBOL(log_buf_len);
+	VMCOREINFO_SYMBOL(log_first_idx);
+	VMCOREINFO_SYMBOL(log_next_idx);
+	/*
+	 * Export struct log size and field offsets. User space tools can
+	 * parse it and detect any changes to structure down the line.
+	 */
+	VMCOREINFO_STRUCT_SIZE(log);
+	VMCOREINFO_OFFSET(log, ts_nsec);
+	VMCOREINFO_OFFSET(log, len);
+	VMCOREINFO_OFFSET(log, text_len);
+	VMCOREINFO_OFFSET(log, dict_len);
+}
+#endif
+
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/*
+ * save requested log_buf_len since it's too early to process it
+ *
+ * The size is rounded up to a power of two and only remembered when it is
+ * larger than the current buffer. Returns -EINVAL when the parameter was
+ * given without a value.
+ */
+static int __init log_buf_len_setup(char *str)
+{
+	unsigned size;
+
+	/* a bare "log_buf_len" hands us NULL; memparse() would dereference it */
+	if (!str)
+		return -EINVAL;
+
+	size = memparse(str, &str);
+
+	if (size)
+		size = roundup_pow_of_two(size);
+	if (size > log_buf_len)
+		new_log_buf_len = size;
+
+	return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
+
+/*
+ * Switch to the larger buffer requested with log_buf_len=, if any.
+ * @early selects memblock_alloc() instead of alloc_bootmem_nopanic().
+ * The static buffer's contents are copied over under logbuf_lock.
+ */
+void __init setup_log_buf(int early)
+{
+	unsigned long flags;
+	char *new_log_buf;
+	int free;
+
+	if (!new_log_buf_len)
+		return;
+
+	if (early) {
+		unsigned long mem;
+
+		mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+		if (!mem)
+			return;
+		new_log_buf = __va(mem);
+	} else {
+		new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
+	}
+
+	if (unlikely(!new_log_buf)) {
+		pr_err("log_buf_len: %ld bytes not available\n",
+			new_log_buf_len);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	log_buf_len = new_log_buf_len;
+	log_buf = new_log_buf;
+	new_log_buf_len = 0;
+	free = __LOG_BUF_LEN - log_next_idx;
+	memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+	pr_info("log_buf_len: %d\n", log_buf_len);
+	pr_info("early log buf free: %d(%d%%)\n",
+		free, (free * 100) / __LOG_BUF_LEN);
+}
+
+static bool __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+	ignore_loglevel = 1;
+	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+	return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
+	"print all kernel messages to the console.");
+
+#ifdef CONFIG_BOOT_PRINTK_DELAY
+
+static int boot_delay; /* msecs delay after each printk during bootup */
+static unsigned long long loops_per_msec;	/* based on boot_delay */
+
+/* parse boot_delay=; values above 10 seconds disable the delay */
+static int __init boot_delay_setup(char *str)
+{
+	unsigned long lpj;
+
+	lpj = preset_lpj ? preset_lpj : 1000000;	/* some guess */
+	loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
+
+	get_option(&str, &boot_delay);
+	if (boot_delay > 10 * 1000)
+		boot_delay = 0;
+
+	pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
+		"HZ: %d, loops_per_msec: %llu\n",
+		boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
+	return 1;
+}
+__setup("boot_delay=", boot_delay_setup);
+
+/*
+ * Busy-wait for boot_delay milliseconds. Does nothing outside of
+ * SYSTEM_BOOTING or for messages the console would not show.
+ */
+static void boot_delay_msec(int level)
+{
+	unsigned long long k;
+	unsigned long timeout;
+
+	if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+		|| (level >= console_loglevel && !ignore_loglevel)) {
+		return;
+	}
+
+	k = (unsigned long long)loops_per_msec * boot_delay;
+
+	timeout = jiffies + msecs_to_jiffies(boot_delay);
+	while (k) {
+		k--;
+		cpu_relax();
+		/*
+		 * use (volatile) jiffies to prevent
+		 * compiler reduction; loop termination via jiffies
+		 * is secondary and may or may not happen.
+		 */
+		if (time_after(jiffies, timeout))
+			break;
+		touch_nmi_watchdog();
+	}
+}
+#else
+static inline void boot_delay_msec(int level)
+{
+}
+#endif
+
+#if defined(CONFIG_PRINTK_TIME)
+static bool printk_time = 1;
+#else
+static bool printk_time;
+#endif
+module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+
+/*
+ * Format the "[seconds.microseconds] " stamp into buf and return its
+ * length; with buf == NULL only the length is computed. Returns 0 when
+ * printk_time is off.
+ */
+static size_t print_time(u64 ts, char *buf)
+{
+	unsigned long rem_nsec;
+
+	if (!printk_time)
+		return 0;
+
+	rem_nsec = do_div(ts, 1000000000);
+
+	if (!buf)
+		return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
+
+	return sprintf(buf, "[%5lu.%06lu] ",
+		       (unsigned long)ts, rem_nsec / 1000);
+}
+
+/*
+ * Emit the optional "<prefix>" (when syslog is set) followed by the time
+ * stamp; with buf == NULL only the resulting length is computed.
+ */
+static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
+{
+	size_t len = 0;
+	unsigned int prefix = (msg->facility << 3) | msg->level;
+
+	if (syslog) {
+		if (buf) {
+			len += sprintf(buf, "<%u>", prefix);
+		} else {
+			len += 3;
+			if (prefix > 999)
+				len += 3;
+			else if (prefix > 99)
+				len += 2;
+			else if (prefix > 9)
+				len++;
+		}
+	}
+
+	len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+	return len;
+}
+
+/*
+ * Render one record as text, one prefixed line per embedded '\n'.
+ * With buf == NULL nothing is written and only the length is returned.
+ * Output stops early when the next line would not fit into size.
+ */
+static size_t msg_print_text(const struct log *msg, enum log_flags prev,
+			     bool syslog, char *buf, size_t size)
+{
+	const char *text = log_text(msg);
+	size_t text_size = msg->text_len;
+	bool prefix = true;
+	bool newline = true;
+	size_t len = 0;
+
+	if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
+		prefix = false;
+
+	if (msg->flags & LOG_CONT) {
+		if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
+			prefix = false;
+
+		if (!(msg->flags & LOG_NEWLINE))
+			newline = false;
+	}
+
+	do {
+		const char *next = memchr(text, '\n', text_size);
+		size_t text_len;
+
+		if (next) {
+			text_len = next - text;
+			next++;
+			text_size -= next - text;
+		} else {
+			text_len = text_size;
+		}
+
+		if (buf) {
+			if (print_prefix(msg, syslog, NULL) +
+			    text_len + 1 >= size - len)
+				break;
+
+			if (prefix)
+				len += print_prefix(msg, syslog, buf + len);
+			memcpy(buf + len, text, text_len);
+			len += text_len;
+			if (next || newline)
+				buf[len++] = '\n';
+		} else {
+			/* SYSLOG_ACTION_* buffer size only calculation */
+			if (prefix)
+				len += print_prefix(msg, syslog, NULL);
+			len += text_len;
+			if (next || newline)
+				len++;
+		}
+
+		prefix = true;
+		text = next;
+	} while (text);
+
+	return len;
+}
+
+/*
+ * Copy unread records to userspace for SYSLOG_ACTION_READ, advancing the
+ * global syslog_* cursor. A record that does not fit is delivered
+ * piecewise; syslog_partial remembers how much of it was already handed
+ * out. Returns the number of bytes copied, -ENOMEM or -EFAULT.
+ */
+static int syslog_print(char __user *buf, int size)
+{
+	char *text;
+	struct log *msg;
+	int len = 0;
+
+	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+	if (!text)
+		return -ENOMEM;
+
+	while (size > 0) {
+		size_t n;
+		size_t skip;
+
+		raw_spin_lock_irq(&logbuf_lock);
+		if (syslog_seq < log_first_seq) {
+			/* messages are gone, move to first one */
+			syslog_seq = log_first_seq;
+			syslog_idx = log_first_idx;
+			syslog_prev = 0;
+			syslog_partial = 0;
+		}
+		if (syslog_seq == log_next_seq) {
+			raw_spin_unlock_irq(&logbuf_lock);
+			break;
+		}
+
+		skip = syslog_partial;
+		msg = log_from_idx(syslog_idx);
+		n = msg_print_text(msg, syslog_prev, true, text,
+				   LOG_LINE_MAX + PREFIX_MAX);
+		if (n - syslog_partial <= size) {
+			/* message fits into buffer, move forward */
+			syslog_idx = log_next(syslog_idx);
+			syslog_seq++;
+			syslog_prev = msg->flags;
+			n -= syslog_partial;
+			syslog_partial = 0;
+		} else if (!len){
+			/* partial read(), remember position */
+			n = size;
+			syslog_partial += n;
+		} else
+			n = 0;
+		raw_spin_unlock_irq(&logbuf_lock);
+
+		if (!n)
+			break;
+
+		if (copy_to_user(buf, text + skip, n)) {
+			if (!len)
+				len = -EFAULT;
+			break;
+		}
+
+		len += n;
+		size -= n;
+		buf += n;
+	}
+
+	kfree(text);
+	return len;
+}
+
+/*
+ * Copy the newest records since the last clear that fit into size bytes
+ * to buf (skipped when buf is NULL), and optionally move the clear
+ * marker to the end of the log. logbuf_lock is dropped around each
+ * copy_to_user().
+ */
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+	char *text;
+	int len = 0;
+
+	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+	if (!text)
+		return -ENOMEM;
+
+	raw_spin_lock_irq(&logbuf_lock);
+	if (buf) {
+		u64 next_seq;
+		u64 seq;
+		u32 idx;
+		enum log_flags prev;
+
+		if (clear_seq < log_first_seq) {
+			/* messages are gone, move to first available one */
+			clear_seq = log_first_seq;
+			clear_idx = log_first_idx;
+		}
+
+		/*
+		 * Find first record that fits, including all following records,
+		 * into the user-provided buffer for this dump.
+		 */
+		seq = clear_seq;
+		idx = clear_idx;
+		prev = 0;
+		while (seq < log_next_seq) {
+			struct log *msg = log_from_idx(idx);
+
+			len += msg_print_text(msg, prev, true, NULL, 0);
+			prev = msg->flags;
+			idx = log_next(idx);
+			seq++;
+		}
+
+		/* move first record forward until length fits into the buffer */
+		seq = clear_seq;
+		idx = clear_idx;
+		prev = 0;
+		while (len > size && seq < log_next_seq) {
+			struct log *msg = log_from_idx(idx);
+
+			len -= msg_print_text(msg, prev, true, NULL, 0);
+			prev = msg->flags;
+			idx = log_next(idx);
+			seq++;
+		}
+
+		/* last message fitting into this dump */
+		next_seq = log_next_seq;
+
+		len = 0;
+		prev = 0;
+		while (len >= 0 && seq < next_seq) {
+			struct log *msg = log_from_idx(idx);
+			int textlen;
+
+			textlen = msg_print_text(msg, prev, true, text,
+						 LOG_LINE_MAX + PREFIX_MAX);
+			if (textlen < 0) {
+				len = textlen;
+				break;
+			}
+			idx = log_next(idx);
+			seq++;
+			prev = msg->flags;
+
+			raw_spin_unlock_irq(&logbuf_lock);
+			if (copy_to_user(buf + len, text, textlen))
+				len = -EFAULT;
+			else
+				len += textlen;
+			raw_spin_lock_irq(&logbuf_lock);
+
+			if (seq < log_first_seq) {
+				/* messages are gone, move to next one */
+				seq = log_first_seq;
+				idx = log_first_idx;
+				prev = 0;
+			}
+		}
+	}
+
+	if (clear) {
+		clear_seq = log_next_seq;
+		clear_idx = log_next_idx;
+	}
+	raw_spin_unlock_irq(&logbuf_lock);
+
+	kfree(text);
+	return len;
+}
+
+/*
+ * Back end of syslog(2) and /proc/kmsg: dispatch on the SYSLOG_ACTION_*
+ * type. Returns a byte/record count or 0 on success, negative errno on
+ * failure.
+ *
+ * NOTE(review): security_syslog() is called here even though
+ * check_syslog_permissions() already ends in it on its unrestricted
+ * path, so the hook can run twice for one request. It is kept because
+ * the early "return 0" paths of check_syslog_permissions() never reach
+ * the hook.
+ */
+int do_syslog(int type, char __user *buf, int len, bool from_file)
+{
+	bool clear = false;
+	static int saved_console_loglevel = -1;
+	int error;
+
+	error = check_syslog_permissions(type, from_file);
+	if (error)
+		goto out;
+
+	error = security_syslog(type);
+	if (error)
+		return error;
+
+	switch (type) {
+	case SYSLOG_ACTION_CLOSE:	/* Close log */
+		break;
+	case SYSLOG_ACTION_OPEN:	/* Open log */
+		break;
+	case SYSLOG_ACTION_READ:	/* Read from log */
+		error = -EINVAL;
+		if (!buf || len < 0)
+			goto out;
+		error = 0;
+		if (!len)
+			goto out;
+		if (!access_ok(VERIFY_WRITE, buf, len)) {
+			error = -EFAULT;
+			goto out;
+		}
+		error = wait_event_interruptible(log_wait,
+						 syslog_seq != log_next_seq);
+		if (error)
+			goto out;
+		error = syslog_print(buf, len);
+		break;
+	/* Read/clear last kernel messages */
+	case SYSLOG_ACTION_READ_CLEAR:
+		clear = true;
+		/* FALL THRU */
+	/* Read last kernel messages */
+	case SYSLOG_ACTION_READ_ALL:
+		error = -EINVAL;
+		if (!buf || len < 0)
+			goto out;
+		error = 0;
+		if (!len)
+			goto out;
+		if (!access_ok(VERIFY_WRITE, buf, len)) {
+			error = -EFAULT;
+			goto out;
+		}
+		error = syslog_print_all(buf, len, clear);
+		break;
+	/* Clear ring buffer */
+	case SYSLOG_ACTION_CLEAR:
+		syslog_print_all(NULL, 0, true);
+		break;
+	/* Disable logging to console */
+	case SYSLOG_ACTION_CONSOLE_OFF:
+		if (saved_console_loglevel == -1)
+			saved_console_loglevel = console_loglevel;
+		console_loglevel = minimum_console_loglevel;
+		break;
+	/* Enable logging to console */
+	case SYSLOG_ACTION_CONSOLE_ON:
+		if (saved_console_loglevel != -1) {
+			console_loglevel = saved_console_loglevel;
+			saved_console_loglevel = -1;
+		}
+		break;
+	/* Set level of messages printed to console */
+	case SYSLOG_ACTION_CONSOLE_LEVEL:
+		error = -EINVAL;
+		if (len < 1 || len > 8)
+			goto out;
+		if (len < minimum_console_loglevel)
+			len = minimum_console_loglevel;
+		console_loglevel = len;
+		/* Implicitly re-enable logging to console */
+		saved_console_loglevel = -1;
+		error = 0;
+		break;
+	/* Number of chars in the log buffer */
+	case SYSLOG_ACTION_SIZE_UNREAD:
+		raw_spin_lock_irq(&logbuf_lock);
+		if (syslog_seq < log_first_seq) {
+			/* messages are gone, move to first one */
+			syslog_seq = log_first_seq;
+			syslog_idx = log_first_idx;
+			syslog_prev = 0;
+			syslog_partial = 0;
+		}
+		if (from_file) {
+			/*
+			 * Short-cut for poll(/"proc/kmsg") which simply checks
+			 * for pending data, not the size; return the count of
+			 * records, not the length.
+			 *
+			 * Use the sequence numbers: the byte indices wrap, so
+			 * their difference can be zero or negative while
+			 * records are still pending.
+			 */
+			error = log_next_seq - syslog_seq;
+		} else {
+			u64 seq = syslog_seq;
+			u32 idx = syslog_idx;
+			enum log_flags prev = syslog_prev;
+
+			error = 0;
+			while (seq < log_next_seq) {
+				struct log *msg = log_from_idx(idx);
+
+				error += msg_print_text(msg, prev, true, NULL, 0);
+				idx = log_next(idx);
+				seq++;
+				prev = msg->flags;
+			}
+			error -= syslog_partial;
+		}
+		raw_spin_unlock_irq(&logbuf_lock);
+		break;
+	/* Size of the log buffer */
+	case SYSLOG_ACTION_SIZE_BUFFER:
+		error = log_buf_len;
+		break;
+	default:
+		error = -EINVAL;
+		break;
+	}
+out:
+	return error;
+}
+
+SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
+{
+	return do_syslog(type, buf, len, SYSLOG_FROM_READER);
+}
+
+/*
+ * Call the console drivers, asking them to write out the len bytes
+ * at text, unless level is filtered out by console_loglevel.
+ * The console_lock must be held.
+ */
+static void call_console_drivers(int level, const char *text, size_t len)
+{
+	struct console *con;
+
+	trace_console(text, len);
+
+	if (level >= console_loglevel && !ignore_loglevel)
+		return;
+	if (!console_drivers)
+		return;
+
+	for_each_console(con) {
+		if (exclusive_console && con != exclusive_console)
+			continue;
+		if (!(con->flags & CON_ENABLED))
+			continue;
+		if (!con->write)
+			continue;
+		if (!cpu_online(smp_processor_id()) &&
+		    !(con->flags & CON_ANYTIME))
+			continue;
+		con->write(con, text, len);
+	}
+}
+
+/*
+ * Zap console related locks when oopsing. Only zap at most once
+ * every 10 seconds, to leave time for slow consoles to print a
+ * full oops.
+ */ +static void zap_locks(void) +{ + static unsigned long oops_timestamp; + + if (time_after_eq(jiffies, oops_timestamp) && + !time_after(jiffies, oops_timestamp + 30 * HZ)) + return; + + oops_timestamp = jiffies; + + debug_locks_off(); + /* If a crash is occurring, make sure we can't deadlock */ + raw_spin_lock_init(&logbuf_lock); + /* And make sure that we print immediately */ + sema_init(&console_sem, 1); +} + +/* Check if we have any console registered that can be called early in boot. */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_lock held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. + */ +static int console_trylock_for_printk(unsigned int cpu) + __releases(&logbuf_lock) +{ + int retval = 0, wake = 0; + + if (console_trylock()) { + retval = 1; + + /* + * If we can't use the console, we need to release + * the console semaphore by hand to avoid flushing + * the buffer. We need to hold the console semaphore + * in order to do this test safely. 
+ */ + if (!can_use_console(cpu)) { + console_locked = 0; + wake = 1; + retval = 0; + } + } + logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); + if (wake) + up(&console_sem); + return retval; +} + +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + +/* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments + * though, are printed immediately to the consoles to ensure everything has + * reached the console in case of a kernel crash. + */ +static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + size_t cons; /* bytes written to console */ + struct task_struct *owner; /* task of first print*/ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ + bool flushed:1; /* buffer sealed and committed */ +} cont; + +static void cont_flush(enum log_flags flags) +{ + if (cont.flushed) + return; + if (cont.len == 0) + return; + + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. 
+ */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } +} + +static bool cont_add(int facility, int level, const char *text, size_t len) +{ + if (cont.len && cont.flushed) + return false; + + if (cont.len + len > sizeof(cont.buf)) { + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); + return false; + } + + if (!cont.len) { + cont.facility = facility; + cont.level = level; + cont.owner = current; + cont.ts_nsec = local_clock(); + cont.flags = 0; + cont.cons = 0; + cont.flushed = false; + } + + memcpy(cont.buf + cont.len, text, len); + cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + + return true; +} + +static size_t cont_print_text(char *text, size_t size) +{ + size_t textlen = 0; + size_t len; + + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { + textlen += print_time(cont.ts_nsec, text); + size -= textlen; + } + + len = cont.len - cont.cons; + if (len > 0) { + if (len+1 > size) + len = size-1; + memcpy(text + textlen, cont.buf + cont.cons, len); + textlen += len; + cont.cons = cont.len; + } + + if (cont.flushed) { + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; + /* got everything, release buffer */ + cont.len = 0; + } + return textlen; +} + +asmlinkage int vprintk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) +{ + static int recursion_bug; + static char textbuf[LOG_LINE_MAX]; + char *text = textbuf; + size_t text_len; + enum log_flags lflags = 0; + unsigned long flags; + int this_cpu; + int printed_len = 0; + + boot_delay_msec(level); + printk_delay(); + + /* This stops the holder of console_sem just where we want him */ + local_irq_save(flags); + this_cpu = smp_processor_id(); + + /* + * Ouch, printk recursed into itself! 
+ */ + if (unlikely(logbuf_cpu == this_cpu)) { + /* + * If a crash is occurring during printk() on this CPU, + * then try to get the crash message out but make sure + * we can't deadlock. Otherwise just return to avoid the + * recursion and return - but flag the recursion so that + * it can be printed at the next appropriate moment: + */ + if (!oops_in_progress && !lockdep_recursing(current)) { + recursion_bug = 1; + goto out_restore_irqs; + } + zap_locks(); + } + + lockdep_off(); + raw_spin_lock(&logbuf_lock); + logbuf_cpu = this_cpu; + + if (recursion_bug) { + static const char recursion_msg[] = + "BUG: recent printk recursion!"; + + recursion_bug = 0; + printed_len += strlen(recursion_msg); + /* emit KERN_CRIT message */ + log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, printed_len); + } + + /* + * The printf needs to come first; we need the syslog + * prefix which might be passed-in as a parameter. + */ + text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + + /* mark and strip a trailing newline */ + if (text_len && text[text_len-1] == '\n') { + text_len--; + lflags |= LOG_NEWLINE; + } + + /* strip kernel syslog prefix and extract log level or control flags */ + if (facility == 0) { + int kern_level = printk_get_level(text); + + if (kern_level) { + const char *end_of_header = printk_skip_level(text); + switch (kern_level) { + case '0' ... '7': + if (level == -1) + level = kern_level - '0'; + case 'd': /* KERN_DEFAULT */ + lflags |= LOG_PREFIX; + case 'c': /* KERN_CONT */ + break; + } + text_len -= end_of_header - text; + text = (char *)end_of_header; + } + } + + if (level == -1) + level = default_message_loglevel; + + if (dict) + lflags |= LOG_PREFIX|LOG_NEWLINE; + + if (!(lflags & LOG_NEWLINE)) { + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. 
+ */ + if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) + cont_flush(LOG_NEWLINE); + + /* buffer line if possible, otherwise store it right away */ + if (!cont_add(facility, level, text, text_len)) + log_store(facility, level, lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); + } else { + bool stored = false; + + /* + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. + */ + if (cont.len && cont.owner == current) { + if (!(lflags & LOG_PREFIX)) + stored = cont_add(facility, level, text, text_len); + cont_flush(LOG_NEWLINE); + } + + if (!stored) + log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); + } + printed_len += text_len; + + /* + * Try to acquire and then immediately release the console semaphore. + * The release will print out buffers and wake up /dev/kmsg and syslog() + * users. + * + * The console_trylock_for_printk() function will release 'logbuf_lock' + * regardless of whether it actually gets the console semaphore or not. + */ + if (console_trylock_for_printk(this_cpu)) + console_unlock(); + + lockdep_on(); +out_restore_irqs: + local_irq_restore(flags); + + return printed_len; +} +EXPORT_SYMBOL(vprintk_emit); + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + return vprintk_emit(0, -1, NULL, 0, fmt, args); +} +EXPORT_SYMBOL(vprintk); + +asmlinkage int printk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_emit(facility, level, dict, dictlen, fmt, args); + va_end(args); + + return r; +} +EXPORT_SYMBOL(printk_emit); + +/** + * printk - print a kernel message + * @fmt: format string + * + * This is printk(). It can be called from any context. We want it to work. + * + * We try to grab the console_lock. 
If we succeed, it's easy - we log the + * output and call the console drivers. If we fail to get the semaphore, we + * place the output into the log buffer and return. The current holder of + * the console_sem will notice the new output in console_unlock(); and will + * send it to the consoles before releasing the lock. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + * + * See also: + * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. + */ +asmlinkage int printk(const char *fmt, ...) +{ + va_list args; + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } +#endif + va_start(args, fmt); + r = vprintk_emit(0, -1, NULL, 0, fmt, args); + va_end(args); + + return r; +} +EXPORT_SYMBOL(printk); + +#else /* CONFIG_PRINTK */ + +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 +#define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; +static enum log_flags console_prev; +static struct cont { + size_t len; + size_t cons; + u8 level; + bool flushed:1; +} cont; +static struct log *log_from_idx(u32 idx) { return NULL; } +static u32 log_next(u32 idx) { return 0; } +static void call_console_drivers(int level, const char *text, size_t len) {} +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { return 0; } +static size_t cont_print_text(char *text, size_t size) { return 0; } + +#endif /* CONFIG_PRINTK */ + +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + 
char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + +static int __add_preferred_console(char *name, int idx, char *options, + char *brl_options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. + */ + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + if (!brl_options) + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + if (!brl_options) + selected_console = i; + c = &console_cmdline[i]; + strlcpy(c->name, name, sizeof(c->name)); + c->options = options; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + c->brl_options = brl_options; +#endif + c->index = idx; + return 0; +} +/* + * Set up a list of consoles. Called from init/main.c + */ +static int __init console_setup(char *str) +{ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char *s, *options, *brl_options = NULL; + int idx; + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (!memcmp(str, "brl,", 4)) { + brl_options = ""; + str += 4; + } else if (!memcmp(str, "brl=", 4)) { + brl_options = str + 4; + str = strchr(brl_options, ','); + if (!str) { + printk(KERN_ERR "need port name after brl=\n"); + return 1; + } + *(str++) = 0; + } +#endif + + /* + * Decode str into name, index, options. 
+ */ + if (str[0] >= '0' && str[0] <= '9') { + strcpy(buf, "ttyS"); + strncpy(buf + 4, str, sizeof(buf) - 5); + } else { + strncpy(buf, str, sizeof(buf) - 1); + } + buf[sizeof(buf) - 1] = 0; + if ((options = strchr(str, ',')) != NULL) + *(options++) = 0; +#ifdef __sparc__ + if (!strcmp(str, "ttya")) + strcpy(buf, "ttyS0"); + if (!strcmp(str, "ttyb")) + strcpy(buf, "ttyS1"); +#endif + for (s = buf; *s; s++) + if ((*s >= '0' && *s <= '9') || *s == ',') + break; + idx = simple_strtoul(s, NULL, 10); + *s = 0; + + __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; + return 1; +} +__setup("console=", console_setup); + +/** + * add_preferred_console - add a device to the list of preferred consoles. + * @name: device name + * @idx: device index + * @options: options for this console + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. 
+ */ +int add_preferred_console(char *name, int idx, char *options) +{ + return __add_preferred_console(name, idx, options, NULL); +} + +int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) +{ + struct console_cmdline *c; + int i; + + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + c = &console_cmdline[i]; + strlcpy(c->name, name_new, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx_new; + return i; + } + /* not found */ + return -1; +} + +bool console_suspend_enabled = 1; +EXPORT_SYMBOL(console_suspend_enabled); + +static int __init console_suspend_disable(char *str) +{ + console_suspend_enabled = 0; + return 1; +} +__setup("no_console_suspend", console_suspend_disable); +module_param_named(console_suspend, console_suspend_enabled, + bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(console_suspend, "suspend console during suspend" + " and hibernate operations"); + +/** + * suspend_console - suspend the console subsystem + * + * This disables printk() while we go into suspend states + */ +void suspend_console(void) +{ + if (!console_suspend_enabled) + return; + printk("Suspending console(s) (use no_console_suspend to debug)\n"); + console_lock(); + console_suspended = 1; + up(&console_sem); +} + +void resume_console(void) +{ + if (!console_suspend_enabled) + return; + down(&console_sem); + console_suspended = 0; + console_unlock(); +} + +/** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused + * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. 
+ */ +static int console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + console_lock(); + console_unlock(); + } + return NOTIFY_OK; +} + +/** + * console_lock - lock the console system for exclusive use. + * + * Acquires a lock which guarantees that the caller has + * exclusive access to the console system and the console_drivers list. + * + * Can sleep, returns nothing. + */ +void console_lock(void) +{ + might_sleep(); + + down(&console_sem); + if (console_suspended) + return; + console_locked = 1; + console_may_schedule = 1; + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); +} +EXPORT_SYMBOL(console_lock); + +/** + * console_trylock - try to lock the console system for exclusive use. + * + * Tried to acquire a lock which guarantees that the caller has + * exclusive access to the console system and the console_drivers list. + * + * returns 1 on success, and 0 on failure to acquire the lock. + */ +int console_trylock(void) +{ + if (down_trylock(&console_sem)) + return 0; + if (console_suspended) { + up(&console_sem); + return 0; + } + console_locked = 1; + console_may_schedule = 0; + mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); + return 1; +} +EXPORT_SYMBOL(console_trylock); + +int is_console_locked(void) +{ + return console_locked; +} + +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. 
+ */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} + +/** + * console_unlock - unlock the console system + * + * Releases the console_lock which the caller holds on the console system + * and the console driver list. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock(); emits + * the output prior to releasing the lock. + * + * If there is output waiting, we wake /dev/kmsg and syslog() users. + * + * console_unlock(); may be called from any context. + */ +void console_unlock(void) +{ + static char text[LOG_LINE_MAX + PREFIX_MAX]; + static u64 seen_seq; + unsigned long flags; + bool wake_klogd = false; + bool retry; + + if (console_suspended) { + up(&console_sem); + return; + } + + console_may_schedule = 0; + + /* flush buffered message fragment immediately to console */ + console_cont_flush(text, sizeof(text)); +again: + for (;;) { + struct log *msg; + size_t len; + int level; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (seen_seq != log_next_seq) { + wake_klogd = true; + seen_seq = log_next_seq; + } + + if (console_seq < log_first_seq) { + /* messages are gone, move to first one */ + console_seq = log_first_seq; + console_idx = log_first_idx; + console_prev = 0; + } +skip: + if (console_seq == log_next_seq) + break; + + msg = log_from_idx(console_idx); + if (msg->flags & LOG_NOCONS) { + /* + * Skip record we have buffered and already printed + * directly to the console when we received it. + */ + console_idx = log_next(console_idx); + console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. 
+ */ + msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; + goto skip; + } + + level = msg->level; + len = msg_print_text(msg, console_prev, false, + text, sizeof(text)); + console_idx = log_next(console_idx); + console_seq++; + console_prev = msg->flags; + raw_spin_unlock(&logbuf_lock); + + stop_critical_timings(); /* don't trace print latency */ + call_console_drivers(level, text, len); + start_critical_timings(); + local_irq_restore(flags); + } + console_locked = 0; + mutex_release(&console_lock_dep_map, 1, _RET_IP_); + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + + raw_spin_unlock(&logbuf_lock); + + up(&console_sem); + + /* + * Someone could have filled up the buffer again, so re-check if there's + * something to flush. In case we cannot trylock the console_sem again, + * there's a new owner and the console_unlock() from them will do the + * flush, no worries. + */ + raw_spin_lock(&logbuf_lock); + retry = console_seq != log_next_seq; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + if (retry && console_trylock()) + goto again; + + if (wake_klogd) + wake_up_klogd(); +} +EXPORT_SYMBOL(console_unlock); + +/** + * console_conditional_schedule - yield the CPU if required + * + * If the console code is currently allowed to sleep, and + * if this CPU should yield the CPU to another task, do + * so here. + * + * Must be called within console_lock();. + */ +void __sched console_conditional_schedule(void) +{ + if (console_may_schedule) + cond_resched(); +} +EXPORT_SYMBOL(console_conditional_schedule); + +void console_unblank(void) +{ + struct console *c; + + /* + * console_unblank can no longer be called in interrupt context unless + * oops_in_progress is set to 1.. 
+ */ + if (oops_in_progress) { + if (down_trylock(&console_sem) != 0) + return; + } else + console_lock(); + + console_locked = 1; + console_may_schedule = 0; + for_each_console(c) + if ((c->flags & CON_ENABLED) && c->unblank) + c->unblank(); + console_unlock(); +} + +/* + * Return the console tty driver structure and its associated index + */ +struct tty_driver *console_device(int *index) +{ + struct console *c; + struct tty_driver *driver = NULL; + + console_lock(); + for_each_console(c) { + if (!c->device) + continue; + driver = c->device(c, index); + if (driver) + break; + } + console_unlock(); + return driver; +} + +/* + * Prevent further output on the passed console device so that (for example) + * serial drivers can disable console output before suspending a port, and can + * re-enable output afterwards. + */ +void console_stop(struct console *console) +{ + console_lock(); + console->flags &= ~CON_ENABLED; + console_unlock(); +} +EXPORT_SYMBOL(console_stop); + +void console_start(struct console *console) +{ + console_lock(); + console->flags |= CON_ENABLED; + console_unlock(); +} +EXPORT_SYMBOL(console_start); + +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + +/* + * The console driver calls this routine during kernel initialization + * to register the console printing procedure with printk() and to + * print any messages that were printed by the kernel before the + * console driver was initialized. + * + * This can happen pretty early during the boot process (because of + * early_printk) - sometimes before setup_arch() completes - be careful + * of what kernel features are used - they may not be initialised yet. 
+ * + * There are two types of consoles - bootconsoles (early_printk) and + * "real" consoles (everything which is not a bootconsole) which are + * handled differently. + * - Any number of bootconsoles can be registered at any time. + * - As soon as a "real" console is registered, all bootconsoles + * will be unregistered automatically. + * - Once a "real" console is registered, any attempt to register a + * bootconsoles will be rejected + */ +void register_console(struct console *newcon) +{ + int i; + unsigned long flags; + struct console *bcon = NULL; + + /* + * before we register a new CON_BOOT console, make sure we don't + * already have a valid console + */ + if (console_drivers && newcon->flags & CON_BOOT) { + /* find the last or real console */ + for_each_console(bcon) { + if (!(bcon->flags & CON_BOOT)) { + printk(KERN_INFO "Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } + } + } + + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + + if (preferred_console < 0 || bcon || !console_drivers) + preferred_console = selected_console; + + if (newcon->early_setup) + newcon->early_setup(); + + /* + * See if we want to use this console driver. If we + * didn't select a console we take the first one + * that registers here. + */ + if (preferred_console < 0) { + if (newcon->index < 0) + newcon->index = 0; + if (newcon->setup == NULL || + newcon->setup(newcon, NULL) == 0) { + newcon->flags |= CON_ENABLED; + if (newcon->device) { + newcon->flags |= CON_CONSDEV; + preferred_console = 0; + } + } + } + + /* + * See if this console matches one we selected on + * the command line. 
+ */ + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { + if (strcmp(console_cmdline[i].name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != console_cmdline[i].index) + continue; + if (newcon->index < 0) + newcon->index = console_cmdline[i].index; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (console_cmdline[i].brl_options) { + newcon->flags |= CON_BRL; + braille_register_console(newcon, + console_cmdline[i].index, + console_cmdline[i].options, + console_cmdline[i].brl_options); + return; + } +#endif + if (newcon->setup && + newcon->setup(newcon, console_cmdline[i].options) != 0) + break; + newcon->flags |= CON_ENABLED; + newcon->index = console_cmdline[i].index; + if (i == selected_console) { + newcon->flags |= CON_CONSDEV; + preferred_console = selected_console; + } + break; + } + + if (!(newcon->flags & CON_ENABLED)) + return; + + /* + * If we have a bootconsole, and are switching to a real console, + * don't print everything out again, since when the boot console, and + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + newcon->flags &= ~CON_PRINTBUFFER; + + /* + * Put this console in the list - keep the + * preferred driver at the head of the list. + */ + console_lock(); + if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { + newcon->next = console_drivers; + console_drivers = newcon; + if (newcon->next) + newcon->next->flags &= ~CON_CONSDEV; + } else { + newcon->next = console_drivers->next; + console_drivers->next = newcon; + } + if (newcon->flags & CON_PRINTBUFFER) { + /* + * console_unlock(); will print out the buffered messages + * for us. 
+ */ + raw_spin_lock_irqsave(&logbuf_lock, flags); + console_seq = syslog_seq; + console_idx = syslog_idx; + console_prev = syslog_prev; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; + } + console_unlock(); + console_sysfs_notify(); + + /* + * By unregistering the bootconsoles after we enable the real console + * we get the "console xxx enabled" message on all the consoles - + * boot consoles, real consoles, etc - this is to ensure that end + * users know there might be something in the kernel's log buffer that + * went to the bootconsole (that they do not see on the real console) + */ + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", + newcon->name, newcon->index); + for_each_console(bcon) + if (bcon->flags & CON_BOOT) + unregister_console(bcon); + } else { + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? 
"boot" : "" , + newcon->name, newcon->index); + } +} +EXPORT_SYMBOL(register_console); + +int unregister_console(struct console *console) +{ + struct console *a, *b; + int res = 1; + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (console->flags & CON_BRL) + return braille_unregister_console(console); +#endif + + console_lock(); + if (console_drivers == console) { + console_drivers=console->next; + res = 0; + } else if (console_drivers) { + for (a=console_drivers->next, b=console_drivers ; + a; b=a, a=b->next) { + if (a == console) { + b->next = a->next; + res = 0; + break; + } + } + } + + /* + * If this isn't the last console and it has CON_CONSDEV set, we + * need to set it on the next preferred console. + */ + if (console_drivers != NULL && console->flags & CON_CONSDEV) + console_drivers->flags |= CON_CONSDEV; + + console_unlock(); + console_sysfs_notify(); + return res; +} +EXPORT_SYMBOL(unregister_console); + +static int __init printk_late_init(void) +{ + struct console *con; + + for_each_console(con) { + if (!keep_bootcon && con->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + con->name, con->index); + unregister_console(con); + } + } + hotcpu_notifier(console_cpu_notify, 0); + return 0; +} +late_initcall(printk_late_init); + +#if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + 
.func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} + +int printk_sched(const char *fmt, ...) +{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + local_irq_restore(flags); + + return r; +} + +/* + * printk rate limiting, lifted from the networking subsystem. + * + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. + */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + +int __printk_ratelimit(const char *func) +{ + return ___ratelimit(&printk_ratelimit_state, func); +} +EXPORT_SYMBOL(__printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. + */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 + || !time_in_range(jiffies, *caller_jiffies, + *caller_jiffies + + msecs_to_jiffies(interval_msecs))) { + *caller_jiffies = jiffies; + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); + +static DEFINE_SPINLOCK(dump_list_lock); +static LIST_HEAD(dump_list); + +/** + * kmsg_dump_register - register a kernel log dumper. 
+ * @dumper: pointer to the kmsg_dumper structure + * + * Adds a kernel log dumper to the system. The dump callback in the + * structure will be called when the kernel oopses or panics and must be + * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. + */ +int kmsg_dump_register(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EBUSY; + + /* The dump callback needs to be set */ + if (!dumper->dump) + return -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + /* Don't allow registering multiple times */ + if (!dumper->registered) { + dumper->registered = 1; + list_add_tail_rcu(&dumper->list, &dump_list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_register); + +/** + * kmsg_dump_unregister - unregister a kmsg dumper. + * @dumper: pointer to the kmsg_dumper structure + * + * Removes a dump device from the system. Returns zero on success and + * %-EINVAL otherwise. + */ +int kmsg_dump_unregister(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + if (dumper->registered) { + dumper->registered = 0; + list_del_rcu(&dumper->list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + synchronize_rcu(); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_unregister); + +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + +/** + * kmsg_dump - dump kernel log to kernel message dumpers. + * @reason: the reason (oops, panic etc) for dumping + * + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). 
+ */ +void kmsg_dump(enum kmsg_dump_reason reason) +{ + struct kmsg_dumper *dumper; + unsigned long flags; + + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + * + * The function is similar to kmsg_dump_get_line(), but grabs no locks. 
+ */ +bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= log_next_seq) + goto out; + + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, 0, syslog, line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; +out: + if (len) + *len = l; + return ret; +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. 
+ */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @buf: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read.
+ */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + enum log_flags prev; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + prev = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, prev, true, NULL, 0); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + prev = 0; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l -= msg_print_text(msg, prev, true, NULL, 0); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + /* last message in next iteration */ + next_seq = seq; + next_idx = idx; + + l = 0; + prev = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, prev, syslog, buf + l, size - l); + idx = log_next(idx); + seq++; + prev = msg->flags; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + +/** + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and
used multiple + * times within the same dumper.dump() callback. + * + * The function is similar to kmsg_dump_rewind(), but grabs no locks. + */ +void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +{ + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; +} + +/** + * kmsg_dump_rewind - reset the iterator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + kmsg_dump_rewind_nolock(dumper); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +static char dump_stack_arch_desc_str[128]; + +/** + * dump_stack_set_arch_desc - set arch-specific str to show with task dumps + * @fmt: printf-style format string + * @...: arguments for the format string + * + * The configured string will be printed right after utsname during task + * dumps. Usually used to add arch-specific system identifiers. If an + * arch wants to make use of such an ID string, it should initialize this + * as soon as possible during boot. + */ +void __init dump_stack_set_arch_desc(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), + fmt, args); + va_end(args); +} + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack().
+ */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + if (dump_stack_arch_desc_str[0] != '\0') + printk("%sHardware name: %s\n", + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); +} + +/** + * show_regs_print_info - print generic debug info for show_regs() + * @log_lvl: log level + * + * show_regs() implementations can use this function to print out generic + * debug information. + */ +void show_regs_print_info(const char *log_lvl) +{ + dump_stack_print_info(log_lvl); + + printk("%stask: %p ti: %p task.ti: %p\n", + log_lvl, current, current_thread_info(), + task_thread_info(current)); +} + +#endif -- cgit v1.2.3 From d197c43d04decb6b1298fa3ef26ea04a9ca7c977 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 13:53:44 -0700 Subject: printk: add console_cmdline.h Add an include file for the console_cmdline struct so that the braille console driver can be separated. Signed-off-by: Joe Perches Cc: Samuel Thibault Cc: Ming Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/console_cmdline.h | 14 ++++++++++++++ kernel/printk/printk.c | 12 +++--------- 2 files changed, 17 insertions(+), 9 deletions(-) create mode 100644 kernel/printk/console_cmdline.h (limited to 'kernel') diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h new file mode 100644 index 000000000000..cbd69d842341 --- /dev/null +++ b/kernel/printk/console_cmdline.h @@ -0,0 +1,14 @@ +#ifndef _CONSOLE_CMDLINE_H +#define _CONSOLE_CMDLINE_H + +struct console_cmdline +{ + char name[8]; /* Name of the driver */ + int index; /* Minor dev. 
to use */ + char *options; /* Options for the driver */ +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + char *brl_options; /* Options for braille driver */ +#endif +}; + +#endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 69b0890ed7e5..4da2b2c7f67d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -51,6 +51,8 @@ #define CREATE_TRACE_POINTS #include +#include "console_cmdline.h" + /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -105,19 +107,11 @@ static struct console *exclusive_console; /* * Array of consoles built from command line options (console=) */ -struct console_cmdline -{ - char name[8]; /* Name of the driver */ - int index; /* Minor dev. to use */ - char *options; /* Options for the driver */ -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - char *brl_options; /* Options for braille driver */ -#endif -}; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; + static int selected_console = -1; static int preferred_console = -1; int console_set_on_cmdline; -- cgit v1.2.3 From bbeddf52adc1b4207674ab88686cbbe58c24f721 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 13:53:45 -0700 Subject: printk: move braille console support into separate braille.[ch] files Create files with prototypes and static inlines for braille support. Make braille_console functions return 1 on success. Corrected CONFIG_A11Y_BRAILLE_CONSOLE=n _braille_console_setup return value to NULL. 
Signed-off-by: Joe Perches Reviewed-by: Samuel Thibault Cc: Ming Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/Makefile | 1 + kernel/printk/braille.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/braille.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 44 +++++++++++++------------------------------- 4 files changed, 110 insertions(+), 31 deletions(-) create mode 100644 kernel/printk/braille.c create mode 100644 kernel/printk/braille.h (limited to 'kernel') diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 36d306d9273c..85405bdcf2b3 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1 +1,2 @@ obj-y = printk.o +obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c new file mode 100644 index 000000000000..b51087fb9ace --- /dev/null +++ b/kernel/printk/braille.c @@ -0,0 +1,48 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include "console_cmdline.h" +#include "braille.h" + +char *_braille_console_setup(char **str, char **brl_options) +{ + if (!memcmp(*str, "brl,", 4)) { + *brl_options = ""; + *str += 4; + } else if (!memcmp(str, "brl=", 4)) { + *brl_options = *str + 4; + *str = strchr(*brl_options, ','); + if (!*str) + pr_err("need port name after brl=\n"); + else + *((*str)++) = 0; + } + + return *str; +} + +int +_braille_register_console(struct console *console, struct console_cmdline *c) +{ + int rtn = 0; + + if (c->brl_options) { + console->flags |= CON_BRL; + rtn = braille_register_console(console, c->index, c->options, + c->brl_options); + } + + return rtn; +} + +int +_braille_unregister_console(struct console *console) +{ + if (console->flags & CON_BRL) + return braille_unregister_console(console); + + return 0; +} diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h new file mode 100644 index 000000000000..769d771145c8 --- /dev/null +++ 
b/kernel/printk/braille.h @@ -0,0 +1,48 @@ +#ifndef _PRINTK_BRAILLE_H +#define _PRINTK_BRAILLE_H + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + +static inline void +braille_set_options(struct console_cmdline *c, char *brl_options) +{ + c->brl_options = brl_options; +} + +char * +_braille_console_setup(char **str, char **brl_options); + +int +_braille_register_console(struct console *console, struct console_cmdline *c); + +int +_braille_unregister_console(struct console *console); + +#else + +static inline void +braille_set_options(struct console_cmdline *c, char *brl_options) +{ +} + +static inline char * +_braille_console_setup(char **str, char **brl_options) +{ + return NULL; +} + +static inline int +_braille_register_console(struct console *console, struct console_cmdline *c) +{ + return 0; +} + +static inline int +_braille_unregister_console(struct console *console) +{ + return 0; +} + +#endif + +#endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4da2b2c7f67d..5a022e0c654c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -52,6 +52,7 @@ #include #include "console_cmdline.h" +#include "braille.h" /* printk's without a loglevel use this.. 
*/ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -1769,9 +1770,8 @@ static int __add_preferred_console(char *name, int idx, char *options, c = &console_cmdline[i]; strlcpy(c->name, name, sizeof(c->name)); c->options = options; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - c->brl_options = brl_options; -#endif + braille_set_options(c, brl_options); + c->index = idx; return 0; } @@ -1784,20 +1784,8 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (!memcmp(str, "brl,", 4)) { - brl_options = ""; - str += 4; - } else if (!memcmp(str, "brl=", 4)) { - brl_options = str + 4; - str = strchr(brl_options, ','); - if (!str) { - printk(KERN_ERR "need port name after brl=\n"); - return 1; - } - *(str++) = 0; - } -#endif + if (_braille_console_setup(&str, &brl_options)) + return 1; /* * Decode str into name, index, options. @@ -2291,16 +2279,10 @@ void register_console(struct console *newcon) continue; if (newcon->index < 0) newcon->index = console_cmdline[i].index; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console_cmdline[i].brl_options) { - newcon->flags |= CON_BRL; - braille_register_console(newcon, - console_cmdline[i].index, - console_cmdline[i].options, - console_cmdline[i].brl_options); + + if (_braille_register_console(newcon, &console_cmdline[i])) return; - } -#endif + if (newcon->setup && newcon->setup(newcon, console_cmdline[i].options) != 0) break; @@ -2388,13 +2370,13 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { struct console *a, *b; - int res = 1; + int res; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console->flags & CON_BRL) - return braille_unregister_console(console); -#endif + res = _braille_unregister_console(console); + if (res) + return res; + res = 1; console_lock(); if (console_drivers == console) { console_drivers=console->next; -- cgit v1.2.3 From 23475408c618ecd5b44b7e069fd65ec73d17d9f0 Mon Sep 17 00:00:00 2001 
From: Joe Perches Date: Wed, 31 Jul 2013 13:53:46 -0700 Subject: printk: use pointer for console_cmdline indexing Make the code a bit more compact by always using a pointer for the active console_cmdline. Move overly indented code to correct indent level. Signed-off-by: Joe Perches Cc: Samuel Thibault Cc: Ming Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5a022e0c654c..8f1fb50aa3ce 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1756,18 +1756,19 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - if (!brl_options) - selected_console = i; - return 0; + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (strcmp(c->name, name) == 0 && c->index == idx) { + if (!brl_options) + selected_console = i; + return 0; } + } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) selected_console = i; - c = &console_cmdline[i]; strlcpy(c->name, name, sizeof(c->name)); c->options = options; braille_set_options(c, brl_options); @@ -1840,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha struct console_cmdline *c; int i; - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - c = &console_cmdline[i]; - strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx_new; - return i; + for (i = 0, c = console_cmdline; 
+ i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) + if (strcmp(c->name, name) == 0 && c->index == idx) { + strlcpy(c->name, name_new, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx_new; + return i; } /* not found */ return -1; @@ -2223,6 +2224,7 @@ void register_console(struct console *newcon) int i; unsigned long flags; struct console *bcon = NULL; + struct console_cmdline *c; /* * before we register a new CON_BOOT console, make sure we don't @@ -2270,24 +2272,25 @@ void register_console(struct console *newcon) * See if this console matches one we selected on * the command line. */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; - i++) { - if (strcmp(console_cmdline[i].name, newcon->name) != 0) + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && - newcon->index != console_cmdline[i].index) + newcon->index != c->index) continue; if (newcon->index < 0) - newcon->index = console_cmdline[i].index; + newcon->index = c->index; - if (_braille_register_console(newcon, &console_cmdline[i])) + if (_braille_register_console(newcon, c)) return; if (newcon->setup && newcon->setup(newcon, console_cmdline[i].options) != 0) break; newcon->flags |= CON_ENABLED; - newcon->index = console_cmdline[i].index; + newcon->index = c->index; if (i == selected_console) { newcon->flags |= CON_CONSDEV; preferred_console = selected_console; -- cgit v1.2.3 From 62e32ac3505a0cab1c5ef8ea2c0eab3b26ed855f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 31 Jul 2013 13:53:47 -0700 Subject: printk: rename struct log to struct printk_log Rename the struct to enable moving portions of printk.c to separate files. The rename changes output of /proc/vmcoreinfo. 
Signed-off-by: Joe Perches Cc: Samuel Thibault Cc: Ming Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 80 +++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8f1fb50aa3ce..5b5a7080e2a5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ static int console_may_schedule; * 67 "g" * 0032 00 00 00 padding to next message header * - * The 'struct log' buffer header must never be directly exported to + * The 'struct printk_log' buffer header must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -195,7 +195,7 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct log { +struct printk_log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ @@ -243,7 +243,7 @@ static u32 clear_idx; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN __alignof__(struct log) +#define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -254,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN; static volatile unsigned int logbuf_cpu = UINT_MAX; /* human readable text of the record */ -static char *log_text(const struct log *msg) +static char *log_text(const struct printk_log *msg) { - return (char *)msg + sizeof(struct log); + return (char *)msg + sizeof(struct printk_log); } /* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct log *msg) +static char *log_dict(const struct printk_log *msg) { - return (char *)msg + sizeof(struct log) + msg->text_len; + 
return (char *)msg + sizeof(struct printk_log) + msg->text_len; } /* get record by index; idx must point to valid msg */ -static struct log *log_from_idx(u32 idx) +static struct printk_log *log_from_idx(u32 idx) { - struct log *msg = (struct log *)(log_buf + idx); + struct printk_log *msg = (struct printk_log *)(log_buf + idx); /* * A length == 0 record is the end of buffer marker. Wrap around and * read the message at the start of the buffer. */ if (!msg->len) - return (struct log *)log_buf; + return (struct printk_log *)log_buf; return msg; } /* get next record; idx must point to valid msg */ static u32 log_next(u32 idx) { - struct log *msg = (struct log *)(log_buf + idx); + struct printk_log *msg = (struct printk_log *)(log_buf + idx); /* length == 0 indicates the end of the buffer; wrap */ /* @@ -291,7 +291,7 @@ static u32 log_next(u32 idx) * return the one after that. */ if (!msg->len) { - msg = (struct log *)log_buf; + msg = (struct printk_log *)log_buf; return msg->len; } return idx + msg->len; @@ -303,11 +303,11 @@ static void log_store(int facility, int level, const char *dict, u16 dict_len, const char *text, u16 text_len) { - struct log *msg; + struct printk_log *msg; u32 size, pad_len; /* number of '\0' padding bytes to next message */ - size = sizeof(struct log) + text_len + dict_len; + size = sizeof(struct printk_log) + text_len + dict_len; pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; @@ -319,7 +319,7 @@ static void log_store(int facility, int level, else free = log_first_idx - log_next_idx; - if (free > size + sizeof(struct log)) + if (free > size + sizeof(struct printk_log)) break; /* drop old messages until we have enough contiuous space */ @@ -327,18 +327,18 @@ static void log_store(int facility, int level, log_first_seq++; } - if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the 
buffer. Add an empty header with len == 0 * to signify a wrap around. */ - memset(log_buf + log_next_idx, 0, sizeof(struct log)); + memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); log_next_idx = 0; } /* fill message */ - msg = (struct log *)(log_buf + log_next_idx); + msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); @@ -351,7 +351,7 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct log) + text_len + dict_len + pad_len; + msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; /* insert message */ log_next_idx += msg->len; @@ -474,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct log *msg; + struct printk_log *msg; u64 ts_usec; size_t i; char cont = '-'; @@ -719,14 +719,14 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); /* - * Export struct log size and field offsets. User space tools can + * Export struct printk_log size and field offsets. User space tools can * parse it and detect any changes to structure down the line. 
*/ - VMCOREINFO_STRUCT_SIZE(log); - VMCOREINFO_OFFSET(log, ts_nsec); - VMCOREINFO_OFFSET(log, len); - VMCOREINFO_OFFSET(log, text_len); - VMCOREINFO_OFFSET(log, dict_len); + VMCOREINFO_STRUCT_SIZE(printk_log); + VMCOREINFO_OFFSET(printk_log, ts_nsec); + VMCOREINFO_OFFSET(printk_log, len); + VMCOREINFO_OFFSET(printk_log, text_len); + VMCOREINFO_OFFSET(printk_log, dict_len); } #endif @@ -879,7 +879,7 @@ static size_t print_time(u64 ts, char *buf) (unsigned long)ts, rem_nsec / 1000); } -static size_t print_prefix(const struct log *msg, bool syslog, char *buf) +static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) { size_t len = 0; unsigned int prefix = (msg->facility << 3) | msg->level; @@ -902,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { const char *text = log_text(msg); @@ -964,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, static int syslog_print(char __user *buf, int size) { char *text; - struct log *msg; + struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1055,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = clear_idx; prev = 0; while (seq < log_next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); len += msg_print_text(msg, prev, true, NULL, 0); prev = msg->flags; @@ -1068,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = clear_idx; prev = 0; while (len > size && seq < log_next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); len -= msg_print_text(msg, prev, true, NULL, 0); prev = msg->flags; @@ -1082,7 +1082,7 @@ static int syslog_print_all(char 
__user *buf, int size, bool clear) len = 0; prev = 0; while (len >= 0 && seq < next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); int textlen; textlen = msg_print_text(msg, prev, true, text, @@ -1228,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = 0; while (seq < log_next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); @@ -1714,10 +1714,10 @@ static struct cont { u8 level; bool flushed:1; } cont; -static struct log *log_from_idx(u32 idx) { return NULL; } +static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } @@ -2029,7 +2029,7 @@ void console_unlock(void) console_cont_flush(text, sizeof(text)); again: for (;;) { - struct log *msg; + struct printk_log *msg; size_t len; int level; @@ -2645,7 +2645,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct log *msg; + struct printk_log *msg; size_t l = 0; bool ret = false; @@ -2757,7 +2757,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, idx = dumper->cur_idx; prev = 0; while (seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); l += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); @@ -2770,7 +2770,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, idx = dumper->cur_idx; prev = 0; while (l > size 
&& seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); l -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); @@ -2785,7 +2785,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, l = 0; prev = 0; while (seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); l += msg_print_text(msg, prev, syslog, buf + l, size - l); idx = log_next(idx); -- cgit v1.2.3 From 40c32592668b727cbfcf7b1c0567f581bd62a5e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 3 Jul 2013 23:33:50 -0400 Subject: tracing/kprobes: Fail to unregister if probe event files are in use When a probe is being removed, it cleans up the event files that correspond to the probe. But there is a race between writing to one of these files and deleting the probe. This is especially true for the "enable" file. CPU 0 CPU 1 ----- ----- fd = open("enable",O_WRONLY); probes_open() release_all_trace_probes() unregister_trace_probe() if (trace_probe_is_enabled(tp)) return -EBUSY write(fd, "1", 1) __ftrace_set_clr_event() call->class->reg() (kprobe_register) enable_trace_probe(tp) __unregister_trace_probe(tp); list_del(&tp->list) unregister_probe_event(tp) <-- fails! free_trace_probe(tp) write(fd, "0", 1) __ftrace_set_clr_event() call->class->unreg (kprobe_register) disable_trace_probe(tp) <-- BOOM! 
A test program was written that used two threads to simulate the above scenario adding a nanosleep() interval to change the timings and after several thousand runs, it was able to trigger this bug and crash: BUG: unable to handle kernel paging request at 00000005000000f9 IP: [] probes_open+0x3b/0xa7 PGD 7808a067 PUD 0 Oops: 0000 [#1] PREEMPT SMP Dumping ftrace buffer: --------------------------------- Modules linked in: ipt_MASQUERADE sunrpc ip6t_REJECT nf_conntrack_ipv6 CPU: 1 PID: 2070 Comm: test-kprobe-rem Not tainted 3.11.0-rc3-test+ #47 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 task: ffff880077756440 ti: ffff880076e52000 task.ti: ffff880076e52000 RIP: 0010:[] [] probes_open+0x3b/0xa7 RSP: 0018:ffff880076e53c38 EFLAGS: 00010203 RAX: 0000000500000001 RBX: ffff88007844f440 RCX: 0000000000000003 RDX: 0000000000000003 RSI: 0000000000000003 RDI: ffff880076e52000 RBP: ffff880076e53c58 R08: ffff880076e53bd8 R09: 0000000000000000 R10: ffff880077756440 R11: 0000000000000006 R12: ffffffff810dee35 R13: ffff880079250418 R14: 0000000000000000 R15: ffff88007844f450 FS: 00007f87a276f700(0000) GS:ffff88007d480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000005000000f9 CR3: 0000000077262000 CR4: 00000000000007e0 Stack: ffff880076e53c58 ffffffff81219ea0 ffff88007844f440 ffffffff810dee35 ffff880076e53ca8 ffffffff81130f78 ffff8800772986c0 ffff8800796f93a0 ffffffff81d1b5d8 ffff880076e53e04 0000000000000000 ffff88007844f440 Call Trace: [] ? security_file_open+0x2c/0x30 [] ? unregister_trace_probe+0x4b/0x4b [] do_dentry_open+0x162/0x226 [] finish_open+0x46/0x54 [] do_last+0x7f6/0x996 [] ? inode_permission+0x42/0x44 [] path_openat+0x232/0x496 [] do_filp_open+0x3a/0x8a [] ? __alloc_fd+0x168/0x17a [] do_sys_open+0x70/0x102 [] ? 
trace_hardirqs_on_caller+0x160/0x197 [] SyS_open+0x1e/0x20 [] system_call_fastpath+0x16/0x1b Code: e5 41 54 53 48 89 f3 48 83 ec 10 48 23 56 78 48 39 c2 75 6c 31 f6 48 c7 RIP [] probes_open+0x3b/0xa7 RSP CR2: 00000005000000f9 ---[ end trace 35f17d68fc569897 ]--- The unregister_trace_probe() must be done first, and if it fails it must fail the removal of the kprobe. Several changes have already been made by Oleg Nesterov and Masami Hiramatsu to allow moving the unregister_probe_event() before the removal of the probe and exit the function if it fails. This prevents the tp structure from being used after it is freed. Link: http://lkml.kernel.org/r/20130704034038.819592356@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3811487e7a7a..243f6834d026 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -95,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) } static int register_probe_event(struct trace_probe *tp); -static void unregister_probe_event(struct trace_probe *tp); +static int unregister_probe_event(struct trace_probe *tp); static DEFINE_MUTEX(probe_lock); static LIST_HEAD(probe_list); @@ -351,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp) if (trace_probe_is_enabled(tp)) return -EBUSY; + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_probe_event(tp)) + return -EBUSY; + __unregister_trace_probe(tp); list_del(&tp->list); - unregister_probe_event(tp); return 0; } @@ -632,7 +635,9 @@ static int release_all_trace_probes(void) /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { tp = list_entry(probe_list.next, struct trace_probe, list); - unregister_trace_probe(tp); + ret = unregister_trace_probe(tp); + if (ret) + goto 
end; free_trace_probe(tp); } @@ -1247,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp) return ret; } -static void unregister_probe_event(struct trace_probe *tp) +static int unregister_probe_event(struct trace_probe *tp) { + int ret; + /* tp->event is unregistered in trace_remove_event_call() */ - trace_remove_event_call(&tp->call); - kfree(tp->call.print_fmt); + ret = trace_remove_event_call(&tp->call); + if (!ret) + kfree(tp->call.print_fmt); + return ret; } /* Make a debugfs interface for controlling probe points */ -- cgit v1.2.3 From 2865a8fb44cc32420407362cbda80c10fa09c6b2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 1 Aug 2013 09:56:36 +0800 Subject: workqueue: copy workqueue_attrs with all fields $echo '0' > /sys/bus/workqueue/devices/xxx/numa $cat /sys/bus/workqueue/devices/xxx/numa I got 1. It should be 0, the reason is copy_workqueue_attrs() called in apply_workqueue_attrs() doesn't copy no_numa field. Fix it by making copy_workqueue_attrs() copy ->no_numa too. This would also make get_unbound_pool() set a pool's ->no_numa attribute according to the workqueue attributes used when the pool was created. While harmless, as ->no_numa isn't a pool attribute, this is a bit confusing. Clear it explicitly. tj: Updated description and comments a bit. Signed-off-by: Shaohua Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 55f5f0afcd0d..726adc84b3ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3416,6 +3416,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + /* + * Unlike hash and equality test, this function doesn't ignore + * ->no_numa as it is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears ->no_numa after copying.
+ */ + to->no_numa = from->no_numa; } /* hash value of the content of @attr */ @@ -3583,6 +3589,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + /* + * no_numa isn't a worker_pool attribute, always clear it. See + * 'struct workqueue_attrs' comments for detail. + */ + pool->attrs->no_numa = false; + /* if cpumask is contained inside a NUMA node, we belong to that node */ if (wq_numa_enabled) { for_each_node(node) { -- cgit v1.2.3 From c6c2401d8bbaf9edc189b4c35a8cb2780b8b988e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 3 Jul 2013 23:33:51 -0400 Subject: tracing/uprobes: Fail to unregister if probe event files are in use Uprobes suffer the same problem that kprobes have. There's a race between writing to the "enable" file and removing the probe. The probe checks for it being in use and if it is not, goes about deleting the probe and the event that represents it. But the problem with that is, after it checks if it is in use it can be enabled, and the deletion of the event (access to the probe) will fail, as it is in use. But the uprobe will still be deleted. This is a problem as the event can reference the uprobe that was deleted. The fix is to remove the event first, and check to make sure the event removal succeeds. Then it is safe to remove the probe. When the event exists, either ftrace or perf can enable the probe and prevent the event from being removed. 
Link: http://lkml.kernel.org/r/20130704034038.991525256@goodmis.org Acked-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 51 +++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a23d2d71188e..272261b5f94f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -70,7 +70,7 @@ struct trace_uprobe { (sizeof(struct probe_arg) * (n))) static int register_uprobe_event(struct trace_uprobe *tu); -static void unregister_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu); static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); @@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou } /* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ -static void unregister_trace_uprobe(struct trace_uprobe *tu) +static int unregister_trace_uprobe(struct trace_uprobe *tu) { + int ret; + + ret = unregister_uprobe_event(tu); + if (ret) + return ret; + list_del(&tu->list); - unregister_uprobe_event(tu); free_trace_uprobe(tu); + return 0; } /* Register a trace_uprobe and probe_event */ @@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) /* register as an event */ old_tp = find_probe_event(tu->call.name, tu->call.class->system); - if (old_tp) + if (old_tp) { /* delete old event */ - unregister_trace_uprobe(old_tp); + ret = unregister_trace_uprobe(old_tp); + if (ret) + goto end; + } ret = register_uprobe_event(tu); if (ret) { @@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv) group = UPROBE_EVENT_SYSTEM; if (is_delete) { + int ret; + if (!event) { pr_info("Delete command needs an event name.\n"); return -EINVAL; @@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv) return -ENOENT; } /* delete an event */ 
- unregister_trace_uprobe(tu); + ret = unregister_trace_uprobe(tu); mutex_unlock(&uprobe_lock); - return 0; + return ret; } if (argc < 2) { @@ -408,16 +419,20 @@ fail_address_parse: return ret; } -static void cleanup_all_probes(void) +static int cleanup_all_probes(void) { struct trace_uprobe *tu; + int ret = 0; mutex_lock(&uprobe_lock); while (!list_empty(&uprobe_list)) { tu = list_entry(uprobe_list.next, struct trace_uprobe, list); - unregister_trace_uprobe(tu); + ret = unregister_trace_uprobe(tu); + if (ret) + break; } mutex_unlock(&uprobe_lock); + return ret; } /* Probes listing interfaces */ @@ -462,8 +477,13 @@ static const struct seq_operations probes_seq_op = { static int probes_open(struct inode *inode, struct file *file) { - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - cleanup_all_probes(); + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = cleanup_all_probes(); + if (ret) + return ret; + } return seq_open(file, &probes_seq_op); } @@ -968,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu) return ret; } -static void unregister_uprobe_event(struct trace_uprobe *tu) +static int unregister_uprobe_event(struct trace_uprobe *tu) { + int ret; + /* tu->event is unregistered in trace_remove_event_call() */ - trace_remove_event_call(&tu->call); + ret = trace_remove_event_call(&tu->call); + if (ret) + return ret; kfree(tu->call.print_fmt); tu->call.print_fmt = NULL; + return 0; } /* Make a trace interface for controling probe points */ -- cgit v1.2.3 From ed5467da0e369e65b247b99eb6403cb79172bcda Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Fri, 2 Aug 2013 21:16:43 +0400 Subject: tracing: Fix fields of struct trace_iterator that are zeroed by mistake tracing_read_pipe zeros all fields below "seq". The declaration contains a comment about that, but it doesn't help. The first field is "snapshot", it's true when current open file is snapshot. Looks obvious, that it should not be zeroed.
The second field is "started". It was converted from cpumask_t to cpumask_var_t (v2.6.28-4983-g4462344), in other words it was converted from cpumask to pointer on cpumask. Currently the reference on "started" memory is lost after the first read from tracing_read_pipe and a proper object will never be freed. The "started" is never dereferenced for trace_pipe, because trace_pipe can't have the TRACE_FILE_ANNOTATE options. Link: http://lkml.kernel.org/r/1375463803-3085183-1-git-send-email-avagin@openvz.org Cc: stable@vger.kernel.org # 2.6.30 Signed-off-by: Andrew Vagin Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 882ec1dd1515..f5b35a5e852f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4151,6 +4151,7 @@ waitagain: memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); + cpumask_clear(iter->started); iter->pos = -1; trace_event_read_lock(); -- cgit v1.2.3 From 711e124379e0f889e40e2f01d7f5d61936d3cd23 Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Fri, 2 Aug 2013 18:36:15 -0700 Subject: tracing: Make TRACE_ITER_STOP_ON_FREE stop the correct buffer Releasing the free_buffer file in an instance causes the global buffer to be stopped when TRACE_ITER_STOP_ON_FREE is enabled. Operate on the correct buffer. 
Link: http://lkml.kernel.org/r/1375493777-17261-1-git-send-email-azl@google.com Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5b35a5e852f..531c9e69d0b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4469,7 +4469,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) /* disable tracing ? */ if (trace_flags & TRACE_ITER_STOP_ON_FREE) - tracing_off(); + tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); -- cgit v1.2.3 From 9457158bbc0ee04ecef76862d73eecd8076e9c7b Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Fri, 2 Aug 2013 18:36:16 -0700 Subject: tracing: Fix reset of time stamps during trace_clock changes Fixed two issues with changing the timestamp clock with trace_clock: - The global buffer was reset on instance clock changes. Change this to pass the correct per-instance buffer - ftrace_now() is used to set buf->time_start in tracing_reset_online_cpus(). This was incorrect because ftrace_now() used the global buffer's clock to return the current time. Change this to use buffer_ftrace_now() which returns the current time for the correct per-instance buffer. 
Also removed tracing_reset_current() because it is not used anywhere Link: http://lkml.kernel.org/r/1375493777-17261-2-git-send-email-azl@google.com Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Cc: stable@vger.kernel.org # 3.10 Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 531c9e69d0b3..496f94d57698 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -243,20 +243,25 @@ int filter_current_check_discard(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(filter_current_check_discard); -cycle_t ftrace_now(int cpu) +cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.trace_buffer.buffer) + if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(buf->buffer, cpu); + ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } +cycle_t ftrace_now(int cpu) +{ + return buffer_ftrace_now(&global_trace.trace_buffer, cpu); +} + /** * tracing_is_enabled - Show if global_trace has been disabled * @@ -1211,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) /* Make sure all commits have finished */ synchronize_sched(); - buf->time_start = ftrace_now(buf->cpu); + buf->time_start = buffer_ftrace_now(buf, buf->cpu); for_each_online_cpu(cpu) ring_buffer_reset_cpu(buffer, cpu); @@ -1219,11 +1224,6 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_enable(buffer); } -void tracing_reset_current(int cpu) -{ - tracing_reset(&global_trace.trace_buffer, cpu); -} - /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus(void) { @@ -4634,12 
+4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ - tracing_reset_online_cpus(&global_trace.trace_buffer); + tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); - tracing_reset_online_cpus(&global_trace.max_buffer); + tracing_reset_online_cpus(&tr->max_buffer); #endif mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 6160968cee8b90a5dd95318d716e31d7775c4ef3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Aug 2013 19:38:55 +0200 Subject: userns: unshare_userns(&cred) should not populate cred on failure unshare_userns(new_cred) does *new_cred = prepare_creds() before create_user_ns() which can fail. However, the caller expects that it doesn't need to take care of new_cred if unshare_userns() fails. We could change the single caller, sys_unshare(), but I think it would be more clean to avoid the side effects on failure, so with this patch unshare_userns() does put_cred() itself and initializes *new_cred only if create_user_ns() succeeds.
Cc: stable@vger.kernel.org Signed-off-by: Oleg Nesterov Reviewed-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d8c30db06c5b..6e50a44610ee 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -105,16 +105,21 @@ int create_user_ns(struct cred *new) int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; + int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); - if (!cred) - return -ENOMEM; + if (cred) { + err = create_user_ns(cred); + if (err) + put_cred(cred); + else + *new_cred = cred; + } - *new_cred = cred; - return create_user_ns(cred); + return err; } void free_user_ns(struct user_namespace *ns) -- cgit v1.2.3 From 35114fcbe0b9b0fa3f6653a2a8e4c6b8a9f8cc2d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Aug 2013 17:43:37 +0200 Subject: Revert "ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child)" This reverts commit fab840fc2d542fabcab903db8e03589a6702ba5f. This commit even has the test-case to prove that the tracee can be killed by SIGTRAP if the debugger does not remove the breakpoints before PTRACE_DETACH. However, this is exactly what wineserver deliberately does, set_thread_context() calls PTRACE_ATTACH + PTRACE_DETACH just for PTRACE_POKEUSER(DR*) in between. So we should revert this fix and document that PTRACE_DETACH should keep the breakpoints. 
Reported-by: Felipe Contreras Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4041f5747e73..a146ee327f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,7 +469,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* -- cgit v1.2.3 From 2cfe6c4ac7ee0193780d655c5dea5a73acae1f46 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 Aug 2013 22:55:28 -0400 Subject: printk: Fix return of braille_register_console() Some of my configs I test with have CONFIG_A11Y_BRAILLE_CONSOLE set. When I started testing against v3.11-rc4 my console went bonkers. Using ktest to bisect the issue, it came down to: commit bbeddf52a "printk: move braille console support into separate braille.[ch] files" Looking into the patch I found the problem. It's with the return of braille_register_console(). As anything other than NULL is considered a failure. But for those of us that have CONFIG_A11Y_BRAILLE_CONSOLE set but do not define a "brl" or "brl=" on the command line, we still may want a console that those with sight can still use. Return NULL (success) if "brl" or "brl=" is not on the console line. 
Signed-off-by: Steven Rostedt Acked-by: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/braille.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index b51087fb9ace..276762f3a460 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -19,7 +19,8 @@ char *_braille_console_setup(char **str, char **brl_options) pr_err("need port name after brl=\n"); else *((*str)++) = 0; - } + } else + return NULL; return *str; } -- cgit v1.2.3 From 8742f229b635bf1c1c84a3dfe5e47c814c20b5c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Aug 2013 18:55:32 +0200 Subject: userns: limit the maximum depth of user_namespace->parent chain Ensure that user_namespace->parent chain can't grow too much. Currently we use the hardcoded 32 as the limit. Reported-by: Andy Lutomirski Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6e50a44610ee..9064b919a406 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -62,6 +62,9 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + if (parent_ns->level > 32) + return -EUSERS; + /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, @@ -92,6 +95,7 @@ int create_user_ns(struct cred *new) atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace.
*/ ns->parent = parent_ns; + ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; -- cgit v1.2.3 From e0acd0a68ec7dbf6b7a81a87a867ebd7ac9b76c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 Aug 2013 18:14:00 +0200 Subject: sched: fix the theoretical signal_wake_up() vs schedule() race This is only theoretical, but after try_to_wake_up(p) was changed to check p->state under p->pi_lock the code like __set_current_state(TASK_INTERRUPTIBLE); schedule(); can miss a signal. This is the special case of wait-for-condition, it relies on try_to_wake_up/schedule interaction and thus it does not need mb() between __set_current_state() and if(signal_pending). However, this __set_current_state() can move into the critical section protected by rq->lock, now that try_to_wake_up() takes another lock we need to ensure that it can't be reordered with "if (signal_pending(current))" check inside that section. The patch is actually one-liner, it simply adds smp_wmb() before spin_lock_irq(rq->lock). This is what try_to_wake_up() already does by the same reason. We turn this wmb() into the new helper, smp_mb__before_spinlock(), for better documentation and to allow the architectures to change the default implementation. While at it, kill smp_mb__after_lock(), it has no callers. Perhaps we can also add smp_mb__before/after_spinunlock() for prepare_to_wait(). 
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..ef51b0ef4bdc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1491,7 +1491,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; - smp_wmb(); + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; @@ -2394,6 +2400,12 @@ need_resched: if (sched_feat(HRTICK)) hrtick_clear(rq); + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw; -- cgit v1.2.3 From dfa9771a7c4784bafd0673bc7abcee3813088b77 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 13 Aug 2013 16:00:53 -0700 Subject: microblaze: fix clone syscall Fix inadvertent breakage in the clone syscall ABI for Microblaze that was introduced in commit f3268edbe6fe ("microblaze: switch to generic fork/vfork/clone"). The Microblaze syscall ABI for clone takes the parent tid address in the 4th argument; the third argument slot is used for the stack size. The incorrectly-used CLONE_BACKWARDS type assigned parent tid to the 3rd slot. This commit restores the original ABI so that existing userspace libc code will work correctly. All kernel versions from v3.8-rc1 were affected. 
Signed-off-by: Michal Simek Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 403d2bb8a968..e23bb19e2a3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1679,6 +1679,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) +#elif defined(CONFIG_CLONE_BACKWARDS3) +SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, + int, stack_size, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, -- cgit v1.2.3