From 7614c3dc74733dff4b0e774f7a894b9ea6ec508c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 20:01:16 -0400 Subject: ftrace: Use schedule_on_each_cpu() as a heavy synchronize_sched() The function tracer uses preempt_disable/enable_notrace() for synchronization between reading registered ftrace_ops and unregistering them. Most of the ftrace_ops are global permanent structures that do not require this synchronization. That is, ops may be added and removed from the hlist but are never freed, and wont hurt if a synchronization is missed. But this is not true for dynamically created ftrace_ops or control_ops, which are used by the perf function tracing. The problem here is that the function tracer can be used to trace kernel/user context switches as well as going to and from idle. Basically, it can be used to trace blind spots of the RCU subsystem. This means that even though preempt_disable() is done, a synchronize_sched() will ignore CPUs that haven't made it out of user space or idle. These can include functions that are being traced just before entering or exiting the kernel sections. To implement the RCU synchronization, instead of using synchronize_sched() the use of schedule_on_each_cpu() is performed. This means that when a dynamically allocated ftrace_ops, or a control ops is being unregistered, all CPUs must be touched and execute a ftrace_sync() stub function via the work queues. This will rip CPUs out from idle or in dynamic tick mode. This only happens when a user disables perf function tracing or other dynamically allocated function tracers, but it allows us to continue to debug RCU and context tracking with function tracing. Link: http://lkml.kernel.org/r/1369785676.15552.55.camel@gandalf.local.home Cc: "Paul E. McKenney" Cc: Tejun Heo Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ftrace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c62..800a8a2fbddb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * so there'll be no new users. We must ensure * all current users are done before we free * the control data. + * Note synchronize_sched() is not enough, as we + * use preempt_disable() to do RCU, but the function + * tracer can be called where RCU is not active + * (before user_exit()). */ - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); control_ops_free(ops); } } else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. */ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); + return 0; } -- cgit v1.2.3 From 52d85d763086594f139bf7d3a5641abeb91d9f57 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 12 Jun 2013 12:03:18 +0200 Subject: ftrace: Fix stddev calculation in function profiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When FUNCTION_GRAPH_TRACER is enabled, ftrace can profile kernel functions and print basic statistics about them. Unfortunately, running stddev calculation is wrong. This patch corrects it implementing Welford’s method: s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) . Link: http://lkml.kernel.org/r/1371031398-24048-1-git-send-email-juri.lelli@gmail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/trace/ftrace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 800a8a2fbddb..26e19105cdcc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -641,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v) if (rec->counter <= 1) stddev = 0; else { - stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + /* * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, (rec->counter - 1) * 1000); + do_div(stddev, rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); -- cgit v1.2.3 From f1ed7c741fcd0c3d7d318e7c19813d89934b9296 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 27 Jun 2013 22:18:06 -0400 Subject: ftrace: Do not run selftest if command line parameter is set If the kernel command line ftrace filter parameters are set (ftrace_filter or ftrace_notrace), force the function self test to pass, with a warning why it was forced. If the user adds a filter to the kernel command line, it is assumed that they know what they are doing, and the self test should just not run instead of failing (which disables function tracing) or clearing the filter, as that will probably annoy the user. If the user wants the selftest to run, the message will tell them why it did not. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/trace/ftrace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26e19105cdcc..67708f46baae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3537,8 +3537,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + static int __init set_ftrace_notrace(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } @@ -3546,6 +3550,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } -- cgit v1.2.3 From 195a8afc7ac962f8da795549fe38e825f1372b0d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 23 Jul 2013 22:06:15 -0400 Subject: ftrace: Add check for NULL regs if ops has SAVE_REGS set If a ftrace ops is registered with the SAVE_REGS flag set, and there's already a ops registered to one of its functions but without the SAVE_REGS flag, there's a small race window where the SAVE_REGS ops gets added to the list of callbacks to call for that function before the callback trampoline gets set to save the regs. The problem is, the function is not currently saving regs, which opens a small race window where the ops that is expecting regs to be passed to it, wont. This can cause a crash if the callback were to reference the regs, as the SAVE_REGS guarantees that regs will be set. To fix this, we add a check in the loop case where it checks if the ops has the SAVE_REGS flag set, and if so, it will ignore it if regs is not set. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel/trace/ftrace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 67708f46baae..8ce9eefc5bb4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1441,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, * the hashes are freed with call_rcu_sched(). */ static int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { struct ftrace_hash *filter_hash; struct ftrace_hash *notrace_hash; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * There's a small race when adding ops that the ftrace handler + * that wants regs, may be called without them. We can not + * allow that handler to be called if regs is NULL. + */ + if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) + return 0; +#endif + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); @@ -4218,7 +4228,7 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { return 1; } @@ -4241,7 +4251,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, do_for_each_ftrace_op(op, ftrace_control_list) { if (!(op->flags & FTRACE_OPS_FL_STUB) && !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip)) + ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); @@ -4274,7 +4284,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip)) + if (ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); -- cgit v1.2.3 From 1c80c43290ee576afe8d39ecc905fa3958a5858c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 25 Jul 2013 20:22:00 -0400 Subject: ftrace: Consolidate some duplicate code for updating ftrace ops When ftrace ops modifies the functions that it will trace, the update to the function mcount callers may need to be modified. Consolidate the two places that do the checks to see if an update is required with a wrapper function for those checks. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel/trace/ftrace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8ce9eefc5bb4..92d3334de0c3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3384,6 +3384,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } +static void ftrace_ops_update_code(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} + static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) @@ -3426,9 +3432,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(ops); mutex_unlock(&ftrace_lock); @@ -3655,9 +3660,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); - if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(iter->ops); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From 8c4f3c3fa9681dc549cd35419b259496082fef8b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 30 Jul 2013 00:04:32 -0400 Subject: ftrace: Check module functions being traced on reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's been a nasty bug that would show up and not give much info. The bug displayed the following warning: WARNING: at kernel/trace/ftrace.c:1529 __ftrace_hash_rec_update+0x1e3/0x230() Pid: 20903, comm: bash Tainted: G O 3.6.11+ #38405.trunk Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] __ftrace_hash_rec_update+0x1e3/0x230 [] ftrace_hash_move+0x28/0x1d0 [] ? kfree+0x2c/0x110 [] ftrace_regex_release+0x8e/0x150 [] __fput+0xae/0x220 [] ____fput+0xe/0x10 [] task_work_run+0x72/0x90 [] do_notify_resume+0x6c/0xc0 [] ? trace_hardirqs_on_thunk+0x3a/0x3c [] int_signal+0x12/0x17 ---[ end trace 793179526ee09b2c ]--- It was finally narrowed down to unloading a module that was being traced. It was actually more than that. When functions are being traced, there's a table of all functions that have a ref count of the number of active tracers attached to that function. When a function trace callback is registered to a function, the function's record ref count is incremented. When it is unregistered, the function's record ref count is decremented. If an inconsistency is detected (ref count goes below zero) the above warning is shown and the function tracing is permanently disabled until reboot. The ftrace callback ops holds a hash of functions that it filters on (and/or filters off). If the hash is empty, the default means to filter all functions (for the filter_hash) or to disable no functions (for the notrace_hash). When a module is unloaded, it frees the function records that represent the module functions. These records exist on their own pages, that is function records for one module will not exist on the same page as function records for other modules or even the core kernel. Now when a module unloads, the records that represents its functions are freed. When the module is loaded again, the records are recreated with a default ref count of zero (unless there's a callback that traces all functions, then they will also be traced, and the ref count will be incremented). The problem is that if an ftrace callback hash includes functions of the module being unloaded, those hash entries will not be removed. If the module is reloaded in the same location, the hash entries still point to the functions of the module but the module's ref counts do not reflect that. With the help of Steve and Joern, we found a reproducer: Using uinput module and uinput_release function. cd /sys/kernel/debug/tracing modprobe uinput echo uinput_release > set_ftrace_filter echo function > current_tracer rmmod uinput modprobe uinput # check /proc/modules to see if loaded in same addr, otherwise try again echo nop > current_tracer [BOOM] The above loads the uinput module, which creates a table of functions that can be traced within the module. We add uinput_release to the filter_hash to trace just that function. Enable function tracincg, which increments the ref count of the record associated to uinput_release. Remove uinput, which frees the records including the one that represents uinput_release. Load the uinput module again (and make sure it's at the same address). This recreates the function records all with a ref count of zero, including uinput_release. Disable function tracing, which will decrement the ref count for uinput_release which is now zero because of the module removal and reload, and we have a mismatch (below zero ref count). The solution is to check all currently tracing ftrace callbacks to see if any are tracing any of the module's functions when a module is loaded (it already does that with callbacks that trace all functions). If a callback happens to have a module function being traced, it increments that records ref count and starts tracing that function. There may be a strange side effect with this, where tracing module functions on unload and then reloading a new module may have that new module's functions being traced. This may be something that confuses the user, but it's not a big deal. Another approach is to disable all callback hashes on module unload, but this leaves some ftrace callbacks that may not be registered, but can still have hashes tracing the module's function where ftrace doesn't know about it. That situation can cause the same bug. This solution solves that case too. Another benefit of this solution, is it is possible to trace a module's function on unload and load. Link: http://lkml.kernel.org/r/20130705142629.GA325@redhat.com Reported-by: Jörn Engel Reported-by: Dave Jones Reported-by: Steve Hodgson Tested-by: Steve Hodgson Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 71 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 9 deletions(-) (limited to 'kernel/trace/ftrace.c') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92d3334de0c3..a6d098c6df3f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2169,12 +2169,57 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ops_traces_mod(struct ftrace_ops *ops) +static inline int ops_traces_mod(struct ftrace_ops *ops) { - struct ftrace_hash *hash; + /* + * Filter_hash being empty will default to trace module. + * But notrace hash requires a test of individual module functions. + */ + return ftrace_hash_empty(ops->filter_hash) && + ftrace_hash_empty(ops->notrace_hash); +} + +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + /* If ops traces all mods, we already accounted for it */ + if (ops_traces_mod(ops)) + return 0; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->filter_hash) && + !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + return 0; - hash = ops->filter_hash; - return ftrace_hash_empty(hash); + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + return 0; + + return 1; +} + +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; } static int ftrace_update_code(struct module *mod) @@ -2183,6 +2228,7 @@ static int ftrace_update_code(struct module *mod) struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + bool test = false; int i; /* @@ -2196,9 +2242,12 @@ static int ftrace_update_code(struct module *mod) for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && - ops_traces_mod(ops)) - ref++; + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + if (ops_traces_mod(ops)) + ref++; + else + test = true; + } } } @@ -2208,12 +2257,16 @@ static int ftrace_update_code(struct module *mod) for (pg = ftrace_new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { + int cnt = ref; + /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - p->flags = ref; + if (test) + cnt += referenced_filters(p); + p->flags = cnt; /* * Do the initial record conversion from mcount jump @@ -2233,7 +2286,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. */ - if (ftrace_start_up && ref) { + if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) ftrace_bug(failed, p->ip); -- cgit v1.2.3