Diffstat (limited to 'kernel'): 168 files changed, 12631 insertions, 8410 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index 790d83c7d160..b3097bde4e9c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -5,4 +5,3 @@ config_data.h config_data.gz timeconst.h hz.bc -x509_certificate_list diff --git a/kernel/Makefile b/kernel/Makefile index 43c4c920f30a..53abf008ecb3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o +obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o @@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o @@ -98,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o + +obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h @@ -111,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) - -############################################################################### -# -# Roll all the X.509 certificates that we can find together and pull them into -# the kernel so that they get loaded into the system trusted keyring during -# boot. -# -# We look in the source root and the build root for all files whose name ends -# in ".x509". Unfortunately, this will generate duplicate filenames, so we -# have make canonicalise the pathnames and then sort them to discard the -# duplicates. 
-# -############################################################################### -ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) -X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) -X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509 -X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ - $(or $(realpath $(CERT)),$(CERT)))) -X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw)) - -ifeq ($(X509_CERTIFICATES),) -$(warning *** No X.509 certificates found ***) -endif - -ifneq ($(wildcard $(obj)/.x509.list),) -ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) -$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)") -$(shell rm $(obj)/.x509.list) -endif -endif - -kernel/system_certificates.o: $(obj)/x509_certificate_list - -quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") - -targets += $(obj)/x509_certificate_list -$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list - $(call if_changed,x509certs) - -targets += $(obj)/.x509.list -$(obj)/.x509.list: - @echo $(X509_CERTIFICATES) >$@ -endif - -clean-files := x509_certificate_list .x509.list - -ifeq ($(CONFIG_MODULE_SIG),y) -############################################################################### -# -# If module signing is requested, say by allyesconfig, but a key has not been -# supplied, then one will need to be generated to make sure the build does not -# fail and that the kernel may be used afterwards. -# -############################################################################### -ifndef CONFIG_MODULE_SIG_HASH -$(error Could not determine digest type to use from kernel config) -endif - -signing_key.priv signing_key.x509: x509.genkey - @echo "###" - @echo "### Now generating an X.509 key pair to be used for signing modules." - @echo "###" - @echo "### If this takes a long time, you might wish to run rngd in the" - @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and uses a hardware random" - @echo "### number generator if one is available." - @echo "###" - openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ - -batch -x509 -config x509.genkey \ - -outform DER -out signing_key.x509 \ - -keyout signing_key.priv 2>&1 - @echo "###" - @echo "### Key pair generated." 
- @echo "###" - -x509.genkey: - @echo Generating X.509 key generation config - @echo >x509.genkey "[ req ]" - @echo >>x509.genkey "default_bits = 4096" - @echo >>x509.genkey "distinguished_name = req_distinguished_name" - @echo >>x509.genkey "prompt = no" - @echo >>x509.genkey "string_mask = utf8only" - @echo >>x509.genkey "x509_extensions = myexts" - @echo >>x509.genkey - @echo >>x509.genkey "[ req_distinguished_name ]" - @echo >>x509.genkey "#O = Unspecified company" - @echo >>x509.genkey "CN = Build time autogenerated kernel key" - @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company" - @echo >>x509.genkey - @echo >>x509.genkey "[ myexts ]" - @echo >>x509.genkey "basicConstraints=critical,CA:FALSE" - @echo >>x509.genkey "keyUsage=digitalSignature" - @echo >>x509.genkey "subjectKeyIdentifier=hash" - @echo >>x509.genkey "authorityKeyIdentifier=keyid" -endif diff --git a/kernel/audit.c b/kernel/audit.c index f9e6065346db..8a056a32ded7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb) static void kauditd_send_skb(struct sk_buff *skb) { int err; + int attempts = 0; +#define AUDITD_RETRIES 5 + +restart: /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ + pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", + audit_pid, err); if (audit_pid) { - pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared"); - audit_pid = 0; - audit_sock = NULL; + if (err == -ECONNREFUSED || err == -EPERM + || ++attempts >= AUDITD_RETRIES) { + char s[32]; + + snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); + audit_log_lost(s); + audit_pid = 0; + audit_sock = NULL; + } else { + pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", + attempts, audit_pid); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + goto restart; + } } /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); @@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { - int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; - return rc; + return; } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) - return rc; + return; audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); - - return rc; } int is_audit_feature_set(int i) @@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string, * @string: string to be checked * @len: max length of the string to check */ -int audit_string_contains_control(const char *string, size_t len) +bool audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) - return 1; + return true; } - return 0; + return false; } /** @@ -1761,7 +1775,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) + if (n->ino 
!= AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", diff --git a/kernel/audit.h b/kernel/audit.h index d641f9bb3ed0..de6cbb7cf547 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -50,6 +50,7 @@ enum audit_state { /* Rule lists */ struct audit_watch; +struct audit_fsnotify_mark; struct audit_tree; struct audit_chunk; @@ -252,6 +253,7 @@ struct audit_net { extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; +extern int audit_del_rule(struct audit_entry *); extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; @@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); + +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern char *audit_mark_path(struct audit_fsnotify_mark *mark); +extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); +extern void audit_remove_mark_rule(struct audit_krule *krule); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); +extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); + #else #define audit_put_watch(w) {} #define audit_get_watch(w) {} @@ -278,12 +289,19 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev #define audit_watch_path(w) "" #define audit_watch_compare(w, i, d) 0 +#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) +#define audit_mark_path(m) "" +#define audit_remove_mark(m) +#define audit_remove_mark_rule(k) +#define audit_mark_compare(m, i, d) 0 +#define audit_exe_compare(t, m) (-EINVAL) +#define audit_dupe_exe(n, o) (-EINVAL) #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); extern void audit_put_chunk(struct audit_chunk *); -extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); +extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); extern int audit_make_tree(struct audit_krule *, char *, u32); extern int audit_add_tree_rule(struct audit_krule *); extern int audit_remove_tree_rule(struct audit_krule *); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c new file mode 100644 index 000000000000..27c6046c2c3d --- /dev/null +++ b/kernel/audit_fsnotify.c @@ -0,0 +1,216 @@ +/* audit_fsnotify.c -- tracking inodes + * + * Copyright 2003-2009,2014-2015 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/fsnotify_backend.h> +#include <linux/namei.h> +#include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/security.h> +#include "audit.h" + +/* + * this mark lives on the parent directory of the inode in question. + * but dev, ino, and path are about the child + */ +struct audit_fsnotify_mark { + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + char *path; /* insertion path */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ + struct audit_krule *rule; +}; + +/* fsnotify handle. */ +static struct fsnotify_group *audit_fsnotify_group; + +/* fsnotify events we care about. */ +#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) + +static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark) +{ + kfree(audit_mark->path); + kfree(audit_mark); +} + +static void audit_fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct audit_fsnotify_mark *audit_mark; + + audit_mark = container_of(mark, struct audit_fsnotify_mark, mark); + audit_fsnotify_mark_free(audit_mark); +} + +char *audit_mark_path(struct audit_fsnotify_mark *mark) +{ + return mark->path; +} + +int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +{ + if (mark->ino == AUDIT_INO_UNSET) + return 0; + return (mark->ino == ino) && (mark->dev == dev); +} + +static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, + struct inode *inode) +{ + audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; + audit_mark->ino = inode ? 
inode->i_ino : AUDIT_INO_UNSET; +} + +struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) +{ + struct audit_fsnotify_mark *audit_mark; + struct path path; + struct dentry *dentry; + struct inode *inode; + int ret; + + if (pathname[0] != '/' || pathname[len-1] == '/') + return ERR_PTR(-EINVAL); + + dentry = kern_path_locked(pathname, &path); + if (IS_ERR(dentry)) + return (void *)dentry; /* returning an error */ + inode = path.dentry->d_inode; + mutex_unlock(&inode->i_mutex); + + audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); + if (unlikely(!audit_mark)) { + audit_mark = ERR_PTR(-ENOMEM); + goto out; + } + + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + audit_mark->mark.mask = AUDIT_FS_EVENTS; + audit_mark->path = pathname; + audit_update_mark(audit_mark, dentry->d_inode); + audit_mark->rule = krule; + + ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + if (ret < 0) { + audit_fsnotify_mark_free(audit_mark); + audit_mark = ERR_PTR(ret); + } +out: + dput(dentry); + path_put(&path); + return audit_mark; +} + +static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op) +{ + struct audit_buffer *ab; + struct audit_krule *rule = audit_mark->rule; + + if (!audit_enabled) + return; + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + audit_log_format(ab, "auid=%u ses=%u op=", + from_kuid(&init_user_ns, audit_get_loginuid(current)), + audit_get_sessionid(current)); + audit_log_string(ab, op); + audit_log_format(ab, " path="); + audit_log_untrustedstring(ab, audit_mark->path); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); +} + +void audit_remove_mark(struct audit_fsnotify_mark *audit_mark) +{ + fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group); + fsnotify_put_mark(&audit_mark->mark); +} + +void audit_remove_mark_rule(struct audit_krule *krule) +{ + struct audit_fsnotify_mark *mark = krule->exe; + + audit_remove_mark(mark); +} + +static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) +{ + struct audit_krule *rule = audit_mark->rule; + struct audit_entry *entry = container_of(rule, struct audit_entry, rule); + + audit_mark_log_rule_change(audit_mark, "autoremove_rule"); + audit_del_rule(entry); +} + +/* Update mark data in audit rules based on fsnotify events. 
*/ +static int audit_mark_handle_event(struct fsnotify_group *group, + struct inode *to_tell, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *dname, u32 cookie) +{ + struct audit_fsnotify_mark *audit_mark; + struct inode *inode = NULL; + + audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); + + BUG_ON(group != audit_fsnotify_group); + + switch (data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = ((struct path *)data)->dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = (struct inode *)data; + break; + default: + BUG(); + return 0; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + return 0; + audit_update_mark(audit_mark, inode); + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + audit_autoremove_mark_rule(audit_mark); + + return 0; +} + +static const struct fsnotify_ops audit_mark_fsnotify_ops = { + .handle_event = audit_mark_handle_event, +}; + +static int __init audit_fsnotify_init(void) +{ + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + if (IS_ERR(audit_fsnotify_group)) { + audit_fsnotify_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } + return 0; +} +device_initcall(audit_fsnotify_init); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0f9877273fc..5efe9b299a12 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) return NULL; } -int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) +bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) { int n; for (n = 0; n < chunk->count; n++) if (chunk->owners[n].owner == tree) - return 1; - return 0; + return true; + return false; } /* tagging and untagging inodes with trees */ @@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree) if (rule->tree) { /* not a half-baked one */ audit_tree_log_remove_rule(rule); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6e30024d9aac..656c7e93ac0d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch) int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return (watch->ino != (unsigned long)-1) && + return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && (watch->dev == dev); } @@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path) INIT_LIST_HEAD(&watch->rules); atomic_set(&watch->count, 1); watch->path = path; - watch->dev = (dev_t)-1; - watch->ino = (unsigned long)-1; + watch->dev = AUDIT_DEV_UNSET; + watch->ino = AUDIT_INO_UNSET; return watch; } @@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) if (IS_ERR(watch)) return PTR_ERR(watch); - audit_get_watch(watch); krule->watch = watch; return 0; @@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent, list_replace(&oentry->rule.list, &nentry->rule.list); } + if (oentry->rule.exe) + audit_remove_mark(oentry->rule.exe); audit_watch_log_rule_change(r, owatch, "updated_rules"); @@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) 
list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); audit_watch_log_rule_change(r, w, "remove_rule"); + if (e->rule.exe) + audit_remove_mark(e->rule.exe); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); @@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule, watch_found = 1; - /* put krule's and initial refs to temporary watch */ - audit_put_watch(watch); + /* put krule's ref to temporary watch */ audit_put_watch(watch); audit_get_watch(w); krule->watch = watch = w; + + audit_put_parent(parent); break; } if (!watch_found) { - audit_get_parent(parent); watch->parent = parent; + audit_get_watch(watch); list_add(&watch->wlist, &parent->watches); } list_add(&krule->rlist, &watch->rules); @@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - /* match get in audit_find_parent or audit_init_parent */ - audit_put_parent(parent); - h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: @@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, if (mask & (FS_CREATE|FS_MOVED_TO) && inode) audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); @@ -517,3 +518,36 @@ static int __init audit_watch_init(void) return 0; } device_initcall(audit_watch_init); + +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +{ + struct audit_fsnotify_mark *audit_mark; + char *pathname; + + pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + if (IS_ERR(audit_mark)) { + kfree(pathname); + return PTR_ERR(audit_mark); + } + new->exe = audit_mark; + + return 0; +} + +int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) +{ + struct file *exe_file; + unsigned long ino; + dev_t dev; + + rcu_read_lock(); + exe_file = rcu_dereference(tsk->mm->exe_file); + ino = exe_file->f_inode->i_ino; + dev = exe_file->f_inode->i_sb->s_dev; + rcu_read_unlock(); + return audit_mark_compare(mark, ino, dev); +} diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 72e1660a79a3..b8ff9e193753 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -39,13 +39,13 @@ * Locking model: * * audit_filter_mutex: - * Synchronizes writes and blocking reads of audit's filterlist - * data. Rcu is used to traverse the filterlist and access - * contents of structs audit_entry, audit_watch and opaque - * LSM rules during filtering. If modified, these structures - * must be copied and replace their counterparts in the filterlist. - * An audit_parent struct is not accessed during filtering, so may - * be written directly provided audit_filter_mutex is held. + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * LSM rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. 
+ * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. */ /* Audit filter lists, defined in <linux/audit.h> */ @@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; + case AUDIT_EXE: + if (f->op != Audit_equal) + return -EINVAL; + if (entry->rule.listnr != AUDIT_FILTER_EXIT) + return -EINVAL; + break; }; return 0; } @@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t remain = datasz - sizeof(struct audit_rule_data); int i; char *str; + struct audit_fsnotify_mark *audit_mark; entry = audit_to_entry_common(data); if (IS_ERR(entry)) @@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; + case AUDIT_EXE: + if (entry->rule.exe || f->val > PATH_MAX) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) { + err = PTR_ERR(str); + goto exit_free; + } + entry->rule.buflen += f->val; + + audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + if (IS_ERR(audit_mark)) { + kfree(str); + err = PTR_ERR(audit_mark); + goto exit_free; + } + entry->rule.exe = audit_mark; + break; } } @@ -549,10 +574,10 @@ exit_nofree: return entry; exit_free: - if (entry->rule.watch) - audit_put_watch(entry->rule.watch); /* matches initial get */ if (entry->rule.tree) audit_put_tree(entry->rule.tree); /* that's the temporary one */ + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); /* that's the template one */ audit_free_rule(entry); return ERR_PTR(err); } @@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, krule->filterkey); break; + case AUDIT_EXE: + data->buflen += data->values[i] = + audit_pack_string(&bufp, audit_mark_path(krule->exe)); + break; case AUDIT_LOGINUID_SET: if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { data->fields[i] = AUDIT_LOGINUID; @@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->filterkey, b->filterkey)) return 1; break; + case AUDIT_EXE: + /* both paths exist based on above type compare */ + if (strcmp(audit_mark_path(a->exe), + audit_mark_path(b->exe))) + return 1; + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) err = -ENOMEM; else new->filterkey = fk; + break; + case AUDIT_EXE: + err = audit_dupe_exe(new, old); + break; } if (err) { + if (new->exe) + audit_remove_mark(new->exe); audit_free_rule(entry); return ERR_PTR(err); } @@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int err; + int err = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry) /* normally audit_add_tree_rule() will free it on failure */ if (tree) audit_put_tree(tree); - goto error; + return err; } if (watch) { @@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry) */ if (tree) audit_put_tree(tree); - goto error; + return err; } } if (tree) { err = audit_add_tree_rule(&entry->rule); if (err) { mutex_unlock(&audit_filter_mutex); - 
goto error; + return err; } } @@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - return 0; - -error: - if (watch) - audit_put_watch(watch); /* tmp watch, matches initial get */ return err; } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry) +int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; int ret = 0; @@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry) mutex_lock(&audit_filter_mutex); e = audit_find_rule(entry, &list); if (!e) { - mutex_unlock(&audit_filter_mutex); ret = -ENOENT; goto out; } @@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry) if (e->rule.tree) audit_remove_tree_rule(&e->rule); - list_del_rcu(&e->list); - list_del(&e->rule.list); - call_rcu(&e->rcu, audit_free_rule_rcu); + if (e->rule.exe) + audit_remove_mark_rule(&e->rule); #ifdef CONFIG_AUDITSYSCALL if (!dont_count) @@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry) if (!audit_match_signal(entry)) audit_signals--; #endif - mutex_unlock(&audit_filter_mutex); + + list_del_rcu(&e->list); + list_del(&e->rule.list); + call_rcu(&e->rcu, audit_free_rule_rcu); out: - if (watch) - audit_put_watch(watch); /* match initial get */ + mutex_unlock(&audit_filter_mutex); + if (tree) audit_put_tree(tree); /* that's the temporary one */ @@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, WARN_ON(1); } - if (err || type == AUDIT_DEL_RULE) + if (err || type == AUDIT_DEL_RULE) { + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); audit_free_rule(entry); + } return err; } @@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r) return 0; nentry = audit_dupe_rule(r); + if (entry->rule.exe) + audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e85bdfd15fed..b86cc04959de 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val) return 0; list_for_each_entry(n, &ctx->names_list, list) { - if ((n->ino != -1) && + if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } @@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(ctx->ppid, f->op, f->val); } break; + case AUDIT_EXE: + result = audit_exe_compare(tsk, rule->exe); + break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; @@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, aname->should_free = true; } - aname->ino = (unsigned long)-1; + aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); @@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent, if (inode) audit_copy_inode(found_child, dentry, inode); else - found_child->ino = (unsigned long)-1; + found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e6983be12bd3..13272582eee0 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,2 +1,4 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o 
helpers.o + +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229a6fa4..3f4c99e06c6b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> +#include <linux/perf_event.h> /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) @@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; return &array->map; @@ -150,15 +151,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +168,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) - BUG_ON(array->prog[i] != NULL); + BUG_ON(array->ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ -static int prog_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *prog, *old_prog; + void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) @@ -191,57 +192,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (!bpf_prog_array_compatible(array, prog)) { - bpf_prog_put(prog); - return -EINVAL; - } + new_ptr = map->ops->map_fd_get_ptr(map, ufd); + if (IS_ERR(new_ptr)) + return PTR_ERR(new_ptr); - old_prog = xchg(array->prog + index, prog); - if (old_prog) - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, new_ptr); + if (old_ptr) + map->ops->map_fd_put_ptr(old_ptr); return 0; } -static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +static int fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *old_prog; + void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; - old_prog = xchg(array->prog + index, NULL); - if (old_prog) { - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, NULL); + if (old_ptr) { + map->ops->map_fd_put_ptr(old_ptr); return 0; } else { return -ENOENT; } } +static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct bpf_array *array = container_of(map, 
struct bpf_array, map); + struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) + return prog; + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return ERR_PTR(-EINVAL); + } + return prog; +} + +static void prog_fd_array_put_ptr(void *ptr) +{ + struct bpf_prog *prog = ptr; + + bpf_prog_put_rcu(prog); +} + /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_prog_array_map_clear(struct bpf_map *map) +void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - prog_array_map_delete_elem(map, &i); + fd_array_map_delete_elem(map, &i); } static const struct bpf_map_ops prog_array_ops = { - .map_alloc = prog_array_map_alloc, - .map_free = prog_array_map_free, + .map_alloc = fd_array_map_alloc, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, - .map_lookup_elem = prog_array_map_lookup_elem, - .map_update_elem = prog_array_map_update_elem, - .map_delete_elem = prog_array_map_delete_elem, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = prog_fd_array_get_ptr, + .map_fd_put_ptr = prog_fd_array_put_ptr, }; static struct bpf_map_type_list prog_array_type __read_mostly = { @@ -255,3 +274,69 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + goto err; + + if (attr->inherit) + goto err; + + if (attr->type == PERF_TYPE_RAW) + return event; + + if (attr->type == PERF_TYPE_HARDWARE) + return event; + + if (attr->type == PERF_TYPE_SOFTWARE && + attr->config == PERF_COUNT_SW_BPF_OUTPUT) + return event; +err: + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = &perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(&perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c5bedc82bc1c..334b1bdd572c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; + kmemcheck_annotate_bitfield(fp, meta); + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -90,6 +92,7 @@ struct bpf_prog 
*bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; + fp->aux->prog = fp; return fp; } @@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; + fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. @@ -177,6 +183,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } +EXPORT_SYMBOL_GPL(__bpf_call_base); /** * __bpf_prog_run - run eBPF program on a given context @@ -449,11 +456,15 @@ select_insn: tail_call_cnt++; - prog = READ_ONCE(array->prog[index]); + prog = READ_ONCE(array->ptrs[index]); if (unlikely(!prog)) goto out; - ARG1 = BPF_R1; + /* ARG1 at this point is guaranteed to point to CTX from + * the verifier side due to the fact that the tail call is + * handeled like a helper, that is, bpf_tail_call_proto, + * where arg1_type is ARG_PTR_TO_CTX. + */ insn = prog->insnsi; goto select_insn; out: @@ -717,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp) struct bpf_prog_aux *aux = fp->aux; INIT_WORK(&aux->work, bpf_prog_free_deferred); - aux->prog = fp; schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* RNG for unpriviledged user space with separated state from prandom_u32(). */ +static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); + +void bpf_user_rnd_init_once(void) +{ + prandom_init_once(&bpf_user_rnd_state); +} + +u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* Should someone ever have the rather unwise idea to use some + * of the registers passed into this function, then note that + * this function is called from native eBPF and classic-to-eBPF + * transformations. Register assignments from both sides are + * different, f.e. classic always sets fn(ctx, A, X) here. + */ + struct rnd_state *state; + u32 res; + + state = &get_cpu_var(bpf_user_rnd_state); + res = prandom_u32_state(state); + put_cpu_var(state); + + return res; +} + /* Weak definitions of helper functions in case we don't have bpf syscall. 
*/ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83c209d9b17a..19909b22b4f8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -17,7 +17,7 @@ struct bpf_htab { struct bpf_map map; struct hlist_head *buckets; - spinlock_t lock; + raw_spinlock_t lock; u32 count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) for (i = 0; i < htab->n_buckets; i++) INIT_HLIST_HEAD(&htab->buckets[i]); - spin_lock_init(&htab->lock); + raw_spin_lock_init(&htab->lock); htab->count = 0; htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8) + htab->map.value_size; + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->elem_size * htab->map.max_entries, + PAGE_SIZE) >> PAGE_SHIFT; return &htab->map; free_htab: @@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, l_new->hash = htab_map_hash(l_new->key, key_size); /* bpf_map_update_elem() can be called in_irq() */ - spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&htab->lock, flags); head = select_bucket(htab, l_new->hash); @@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } else { htab->count++; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); return 0; err: - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); kfree(l_new); return ret; } @@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) hash = htab_map_hash(key, key_size); - spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&htab->lock, flags); head = select_bucket(htab, hash); @@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&htab->lock, flags); return ret; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1447ec09421e..4504ca66118d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return prandom_u32(); -} - const struct bpf_func_proto bpf_get_prandom_u32_proto = { - .func = bpf_get_prandom_u32, + .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c new file mode 100644 index 000000000000..be6d726e31c9 --- /dev/null +++ b/kernel/bpf/inode.c @@ -0,0 +1,387 @@ +/* + * Minimal file system backend for holding eBPF maps and programs, + * used by bpf(2) object pinning. + * + * Authors: + * + * Daniel Borkmann <daniel@iogearbox.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/magic.h> +#include <linux/major.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/fs.h> +#include <linux/kdev_t.h> +#include <linux/filter.h> +#include <linux/bpf.h> + +enum bpf_type { + BPF_TYPE_UNSPEC = 0, + BPF_TYPE_PROG, + BPF_TYPE_MAP, +}; + +static void *bpf_any_get(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + break; + case BPF_TYPE_MAP: + atomic_inc(&((struct bpf_map *)raw)->refcnt); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return raw; +} + +static void bpf_any_put(void *raw, enum bpf_type type) +{ + switch (type) { + case BPF_TYPE_PROG: + bpf_prog_put(raw); + break; + case BPF_TYPE_MAP: + bpf_map_put(raw); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) +{ + void *raw; + + *type = BPF_TYPE_MAP; + raw = bpf_map_get(ufd); + if (IS_ERR(raw)) { + *type = BPF_TYPE_PROG; + raw = bpf_prog_get(ufd); + } + + return raw; +} + +static const struct inode_operations bpf_dir_iops; + +static const struct inode_operations bpf_prog_iops = { }; +static const struct inode_operations bpf_map_iops = { }; + +static struct inode *bpf_get_inode(struct super_block *sb, + const struct inode *dir, + umode_t mode) +{ + struct inode *inode; + + switch (mode & S_IFMT) { + case S_IFDIR: + case S_IFREG: + break; + default: + return ERR_PTR(-EINVAL); + } + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOSPC); + + inode->i_ino = get_next_ino(); + inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime; + inode->i_ctime = inode->i_atime; + + inode_init_owner(inode, dir, mode); + + return inode; +} + +static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) +{ + *type = BPF_TYPE_UNSPEC; + if (inode->i_op == &bpf_prog_iops) + *type = BPF_TYPE_PROG; + else if (inode->i_op == &bpf_map_iops) + *type = BPF_TYPE_MAP; + else + return -EACCES; + + return 0; +} + +static bool bpf_dname_reserved(const struct dentry *dentry) +{ + return strchr(dentry->d_name.name, '.'); +} + +static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + + inc_nlink(inode); + inc_nlink(dir); + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct inode_operations *iops) +{ + struct inode *inode; + + if (bpf_dname_reserved(dentry)) + return -EPERM; + + inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = iops; + inode->i_private = dentry->d_fsdata; + + d_instantiate(dentry, inode); + dget(dentry); + + return 0; +} + +static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t devt) +{ + enum bpf_type type = MINOR(devt); + + if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || + dentry->d_fsdata == NULL) + return -EPERM; + + switch (type) { + case BPF_TYPE_PROG: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); + case BPF_TYPE_MAP: + return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); + default: + return -EPERM; + } +} + +static const struct inode_operations bpf_dir_iops = { + .lookup = 
simple_lookup, + .mknod = bpf_mkobj, + .mkdir = bpf_mkdir, + .rmdir = simple_rmdir, + .unlink = simple_unlink, +}; + +static int bpf_obj_do_pin(const struct filename *pathname, void *raw, + enum bpf_type type) +{ + struct dentry *dentry; + struct inode *dir; + struct path path; + umode_t mode; + dev_t devt; + int ret; + + dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); + devt = MKDEV(UNNAMED_MAJOR, type); + + ret = security_path_mknod(&path, dentry, mode, devt); + if (ret) + goto out; + + dir = d_inode(path.dentry); + if (dir->i_op != &bpf_dir_iops) { + ret = -EPERM; + goto out; + } + + dentry->d_fsdata = raw; + ret = vfs_mknod(dir, dentry, mode, devt); + dentry->d_fsdata = NULL; +out: + done_path_create(&path, dentry); + return ret; +} + +int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +{ + struct filename *pname; + enum bpf_type type; + void *raw; + int ret; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_fd_probe_obj(ufd, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + ret = bpf_obj_do_pin(pname, raw, type); + if (ret != 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void *bpf_obj_do_get(const struct filename *pathname, + enum bpf_type *type) +{ + struct inode *inode; + struct path path; + void *raw; + int ret; + + ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + inode = d_backing_inode(path.dentry); + ret = inode_permission(inode, MAY_WRITE); + if (ret) + goto out; + + ret = bpf_inode_type(inode, type); + if (ret) + goto out; + + raw = bpf_any_get(inode->i_private, *type); + touch_atime(&path); + + path_put(&path); + return raw; +out: + path_put(&path); + return ERR_PTR(ret); +} + +int bpf_obj_get_user(const char __user *pathname) +{ + enum bpf_type type = BPF_TYPE_UNSPEC; + struct filename *pname; + int ret = -ENOENT; + void *raw; + + pname = getname(pathname); + if (IS_ERR(pname)) + return PTR_ERR(pname); + + raw = bpf_obj_do_get(pname, &type); + if (IS_ERR(raw)) { + ret = PTR_ERR(raw); + goto out; + } + + if (type == BPF_TYPE_PROG) + ret = bpf_prog_new_fd(raw); + else if (type == BPF_TYPE_MAP) + ret = bpf_map_new_fd(raw); + else + goto out; + + if (ret < 0) + bpf_any_put(raw, type); +out: + putname(pname); + return ret; +} + +static void bpf_evict_inode(struct inode *inode) +{ + enum bpf_type type; + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); +} + +static const struct super_operations bpf_super_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = bpf_evict_inode, +}; + +static int bpf_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr bpf_rfiles[] = { { "" } }; + struct inode *inode; + int ret; + + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); + if (ret) + return ret; + + sb->s_op = &bpf_super_ops; + + inode = sb->s_root->d_inode; + inode->i_op = &bpf_dir_iops; + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= S_ISVTX | S_IRWXUGO; + + return 0; +} + +static struct dentry *bpf_mount(struct file_system_type *type, int flags, + const char *dev_name, void *data) +{ + return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); +} + +static struct file_system_type bpf_fs_type = { + .owner = THIS_MODULE, + .name = 
"bpf", + .mount = bpf_mount, + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +MODULE_ALIAS_FS("bpf"); + +static int __init bpf_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, "bpf"); + if (ret) + return ret; + + ret = register_filesystem(&bpf_fs_type); + if (ret) + sysfs_remove_mount_point(fs_kobj, "bpf"); + + return ret; +} +fs_initcall(bpf_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1b14d197a4f..0d3313d02a7e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include <linux/filter.h> #include <linux/version.h> +int sysctl_unprivileged_bpf_disabled __read_mostly; + static LIST_HEAD(bpf_map_types); static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +static int bpf_map_charge_memlock(struct bpf_map *map) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(map->pages, &user->locked_vm); + + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + map->user = user; + return 0; +} + +static void bpf_map_uncharge_memlock(struct bpf_map *map) +{ + struct user_struct *user = map->user; + + atomic_long_sub(map->pages, &user->locked_vm); + free_uid(user); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + bpf_map_uncharge_memlock(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -72,7 +101,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) /* prog_array stores refcnt-ed bpf_prog pointers * release them all when user space closes prog_array_fd */ - bpf_prog_array_map_clear(map); + bpf_fd_array_map_clear(map); bpf_map_put(map); return 0; @@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = { .release = bpf_map_release, }; +int bpf_map_new_fd(struct bpf_map *map) +{ + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, + O_RDWR | O_CLOEXEC); +} + /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ @@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); - err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + err = bpf_map_charge_memlock(map); + if (err) + goto free_map; + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ goto free_map; @@ -124,19 +162,29 @@ free_map: /* if error is returned, fd is released. 
* On success caller should complete fd access with matching fdput() */ -struct bpf_map *bpf_map_get(struct fd f) +struct bpf_map *__bpf_map_get(struct fd f) { - struct bpf_map *map; - if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_map_fops) { fdput(f); return ERR_PTR(-EINVAL); } - map = f.file->private_data; + return f.file->private_data; +} + +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + atomic_inc(&map->refcnt); + fdput(f); return map; } @@ -155,15 +203,16 @@ static int map_lookup_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value, *ptr; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -213,15 +262,16 @@ static int map_update_elem(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *uvalue = u64_to_ptr(attr->value); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *value; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -265,15 +315,16 @@ static int map_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_ptr(attr->key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; + struct fd f; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -305,15 +356,16 @@ static int map_get_next_key(union bpf_attr *attr) void __user *ukey = u64_to_ptr(attr->key); void __user *unext_key = u64_to_ptr(attr->next_key); int ufd = attr->map_fd; - struct fd f = fdget(ufd); struct bpf_map *map; void *key, *next_key; + struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; - map = bpf_map_get(f); + f = fdget(ufd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -398,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in @@ -432,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } -static void __prog_put_rcu(struct rcu_head *rcu) +static int bpf_prog_charge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + atomic_long_add(prog->pages, &user->locked_vm); + if (atomic_long_read(&user->locked_vm) > memlock_limit) { + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); + return -EPERM; + } + prog->aux->user = user; + return 0; +} + +static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) +{ + struct user_struct *user = prog->aux->user; + + atomic_long_sub(prog->pages, &user->locked_vm); + free_uid(user); +} + +static void __prog_put_common(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, 
rcu); free_used_maps(aux); + bpf_prog_uncharge_memlock(aux->prog); bpf_prog_free(aux->prog); } /* version of bpf_prog_put() that is called after a grace period */ void bpf_prog_put_rcu(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - prog->aux->prog = prog; - call_rcu(&prog->aux->rcu, __prog_put_rcu); - } + if (atomic_dec_and_test(&prog->aux->refcnt)) + call_rcu(&prog->aux->rcu, __prog_put_common); } void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { - free_used_maps(prog->aux); - bpf_prog_free(prog); - } + if (atomic_dec_and_test(&prog->aux->refcnt)) + __prog_put_common(&prog->aux->rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -470,21 +548,22 @@ static const struct file_operations bpf_prog_fops = { .release = bpf_prog_release, }; -static struct bpf_prog *get_prog(struct fd f) +int bpf_prog_new_fd(struct bpf_prog *prog) { - struct bpf_prog *prog; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, + O_RDWR | O_CLOEXEC); +} +static struct bpf_prog *__bpf_prog_get(struct fd f) +{ if (!f.file) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_prog_fops) { fdput(f); return ERR_PTR(-EINVAL); } - prog = f.file->private_data; - - return prog; + return f.file->private_data; } /* called by sockets/tracing/seccomp before attaching program to an event @@ -495,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = get_prog(f); - + prog = __bpf_prog_get(f); if (IS_ERR(prog)) return prog; atomic_inc(&prog->aux->refcnt); fdput(f); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get); @@ -536,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; + if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_nouncharge; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -549,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; prog->orig_prog = NULL; - prog->jited = false; + prog->jited = 0; atomic_set(&prog->aux->refcnt, 1); - prog->gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl ? 1 : 0; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -572,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ goto free_used_maps; @@ -582,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr) free_used_maps: free_used_maps(prog->aux); free_prog: + bpf_prog_uncharge_memlock(prog); +free_prog_nouncharge: bpf_prog_free(prog); return err; } +#define BPF_OBJ_LAST_FIELD bpf_fd + +static int bpf_obj_pin(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ)) + return -EINVAL; + + return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); +} + +static int bpf_obj_get(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + return -EINVAL; + + return bpf_obj_get_user(u64_to_ptr(attr->pathname)); +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; int err; - /* the syscall is limited to root temporarily. 
This restriction will be - * lifted when security audit is clean. Note that eBPF+tracing must have - * this restriction, since it may pass kernel data to user space - */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) return -EPERM; if (!access_ok(VERIFY_READ, uattr, 1)) @@ -650,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_LOAD: err = bpf_prog_load(&attr); break; + case BPF_OBJ_PIN: + err = bpf_obj_pin(&attr); + break; + case BPF_OBJ_GET: + err = bpf_obj_get(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 039d866fd36a..c6073056badf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -199,6 +199,7 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + bool allow_ptr_leaks; }; /* verbose verifier prints what it's seeing @@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static void verbose(const char *fmt, ...) +static __printf(1, 2) void verbose(const char *fmt, ...) { va_list args; @@ -238,6 +239,15 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -275,7 +285,7 @@ static const char *const bpf_class_string[] = { [BPF_ALU64] = "alu64", }; -static const char *const bpf_alu_string[] = { +static const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", @@ -299,7 +309,7 @@ static const char *const bpf_ldst_string[] = { [BPF_DW >> 3] = "u64", }; -static const char *const bpf_jmp_string[] = { +static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", @@ -530,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size) return -EINVAL; } +static bool is_spillable_regtype(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_MAP_VALUE: + case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_STACK: + case PTR_TO_CTX: + case FRAME_PTR: + case CONST_PTR_TO_MAP: + return true; + default: + return false; + } +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -542,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, */ if (value_regno >= 0 && - (state->regs[value_regno].type == PTR_TO_MAP_VALUE || - state->regs[value_regno].type == PTR_TO_STACK || - state->regs[value_regno].type == PTR_TO_CTX)) { + is_spillable_regtype(state->regs[value_regno].type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -635,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } +static bool is_pointer_value(struct verifier_env *env, int regno) +{ + if (env->allow_ptr_leaks) + return false; + + switch (env->cur_state.regs[regno].type) { + case UNKNOWN_VALUE: 
+ case CONST_IMM: + return false; + default: + return true; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -648,6 +685,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, struct verifier_state *state = &env->cur_state; int size, err = 0; + if (state->regs[regno].type == PTR_TO_STACK) + off += state->regs[regno].imm; + size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; @@ -658,24 +698,42 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into map\n", value_regno); + return -EACCES; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); } else if (state->regs[regno].type == PTR_TO_CTX) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into ctx\n", value_regno); + return -EACCES; + } err = check_ctx_access(env, off, size, t); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == FRAME_PTR) { + } else if (state->regs[regno].type == FRAME_PTR || + state->regs[regno].type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } - if (t == BPF_WRITE) + if (t == BPF_WRITE) { + if (!env->allow_ptr_leaks && + state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } err = check_stack_write(state, off, size, value_regno); - else + } else { err = check_stack_read(state, off, size, value_regno); + } } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[state->regs[regno].type]); @@ -763,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_ANYTHING) { + if (is_pointer_value(env, regno)) { + verbose("R%d leaks addr into helper function\n", regno); + return -EACCES; + } return 0; + } if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -833,6 +896,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i < ARRAY_SIZE(func_limit); i++) { + bool_map = (map->map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map & func pair match it can continue. 
+ * don't allow any other map type to be passed into + * the special func; + */ + if (bool_func && bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -908,28 +993,17 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: - * only allow to pass it into bpf_tail_call() for now. - * bpf_map_delete_elem() can be allowed in the future, - * while bpf_map_update_elem() must only be done via syscall - */ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call && - map->map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into - * bpf_tail_call() - */ - return -EINVAL; + err = check_map_func_compatibility(map, func_id); + if (err) + return err; return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { + struct reg_state *regs = env->cur_state.regs; u8 opcode = BPF_OP(insn->code); int err; @@ -954,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } + /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); if (err) @@ -990,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) */ regs[insn->dst_reg] = regs[insn->src_reg]; } else { + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d partial copy of pointer\n", + insn->src_reg); + return -EACCES; + } regs[insn->dst_reg].type = UNKNOWN_VALUE; regs[insn->dst_reg].map_ptr = NULL; } @@ -1039,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) + BPF_SRC(insn->code) == BPF_K) { stack_relative = true; + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->dst_reg); + return -EACCES; + } else if (BPF_SRC(insn->code) == BPF_X && + is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer arithmetic prohibited\n", + insn->src_reg); + return -EACCES; + } /* check dest operand */ err = check_reg_arg(regs, insn->dst_reg, DST_OP); @@ -1079,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env, err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d pointer comparison prohibited\n", + insn->src_reg); + return -EACCES; + } } else { if (insn->src_reg != BPF_REG_0) { verbose("BPF_JMP uses reserved fields\n"); @@ -1133,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env, regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = 0; } + } else if (is_pointer_value(env, insn->dst_reg)) { + verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + return -EACCES; } else if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE)) { @@ -1636,7 +1740,7 @@ static int do_check(struct verifier_env *env) } if (class == BPF_ALU || class == BPF_ALU64) { - 
err = check_alu_op(regs, insn); + err = check_alu_op(env, insn); if (err) return err; @@ -1794,6 +1898,11 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (is_pointer_value(env, BPF_REG_0)) { + verbose("R0 leaks addr as return value\n"); + return -EACCES; + } + process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { @@ -1880,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) } f = fdget(insn->imm); - - map = bpf_map_get(f); + map = __bpf_map_get(f); if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); @@ -2002,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env) cnt = env->prog->aux->ops-> convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf); + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; @@ -2122,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = do_check(env); skip_full_check: diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f89d9292eee6..b9d0cce3f9ce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,7 +45,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/rwsem.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/sort.h> @@ -76,7 +75,7 @@ * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * - * css_set_rwsem protects task->cgroups pointer, the list of css_set + * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. 
* * These locks are exported if CONFIG_PROVE_RCU so that accessors in @@ -84,12 +83,12 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -DECLARE_RWSEM(css_set_rwsem); +DEFINE_SPINLOCK(css_set_lock); EXPORT_SYMBOL_GPL(cgroup_mutex); -EXPORT_SYMBOL_GPL(css_set_rwsem); +EXPORT_SYMBOL_GPL(css_set_lock); #else static DEFINE_MUTEX(cgroup_mutex); -static DECLARE_RWSEM(css_set_rwsem); +static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -107,8 +106,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock); struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_mutex), \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* @@ -139,12 +138,34 @@ static const char *cgroup_subsys_name[] = { }; #undef SUBSYS +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ +#define SUBSYS(_x) \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ + DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ + EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); +#include <linux/cgroup_subsys.h> +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, +static struct static_key_true *cgroup_subsys_enabled_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, +static struct static_key_true *cgroup_subsys_on_dfl_key[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ struct cgroup_root cgrp_dfl_root; +EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the @@ -152,12 +173,6 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; -/* - * Set by the boot param of the same name and makes subsystems with NULL - * ->dfl_files to use ->legacy_files on the default hierarchy. - */ -static bool cgroup_legacy_files_on_dfl; - /* some controllers are not supported in the default hierarchy */ static unsigned long cgrp_dfl_root_inhibit_ss_mask; @@ -185,20 +200,97 @@ static u64 css_serial_nr_next = 1; */ static unsigned long have_fork_callback __read_mostly; static unsigned long have_exit_callback __read_mostly; +static unsigned long have_free_callback __read_mostly; + +/* Ditto for the can_fork callback. 
*/ +static unsigned long have_canfork_callback __read_mostly; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); +static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID + * @ssid: subsys ID of interest + * + * cgroup_subsys_enabled() can only be used with literal subsys names which + * is fine for individual subsystems but unsuitable for cgroup core. This + * is slower static_key_enabled() based test indexed by @ssid. + */ +static bool cgroup_ssid_enabled(int ssid) +{ + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); +} + +/** + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy + * @cgrp: the cgroup of interest + * + * The default hierarchy is the v2 interface of cgroup and this function + * can be used to test whether a cgroup is on the default hierarchy for + * cases where a subsystem should behave differently depending on the + * interface version. + * + * The set of behaviors which change on the default hierarchy are still + * being determined and the mount option is prefixed with __DEVEL__. + * + * List of changed behaviors: + * + * - Mount options "noprefix", "xattr", "clone_children", "release_agent" + * and "name" are disallowed. + * + * - When mounting an existing superblock, mount options should match. + * + * - Remount is disallowed. + * + * - rename(2) is disallowed. + * + * - "tasks" is removed. Everything should be at process granularity. Use + * "cgroup.procs" instead. + * + * - "cgroup.procs" is not sorted. pids will be unique unless they got + * recycled in between reads. + * + * - "release_agent" and "notify_on_release" are removed. Replacement + * notification mechanism will be implemented. + * + * - "cgroup.clone_children" is removed. + * + * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup + * and its descendants contain no task; otherwise, 1. The file also + * generates kernfs notification which can be monitored through poll and + * [di]notify when the value of the file changes. + * + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and + * take masks of ancestors with non-empty cpus/mems, instead of being + * moved to an ancestor. + * + * - cpuset: a task can be moved into an empty cpuset, and again it takes + * masks of ancestors. + * + * - memcg: use_hierarchy is on by default and the cgroup file for the flag + * is not created. + * + * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy.
+ */ +static bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) @@ -207,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; @@ -331,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return !(cgrp->self.flags & CSS_ONLINE); } +static void cgroup_get(struct cgroup *cgrp) +{ + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); +} + +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + +static void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -480,19 +588,31 @@ struct css_set init_css_set = { .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ /** + * css_set_populated - does a css_set contain any tasks? + * @cset: target css_set + */ +static bool css_set_populated(struct css_set *cset) +{ + lockdep_assert_held(&css_set_lock); + + return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); +} + +/** * cgroup_update_populated - updated populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * - * @cgrp is either getting the first task (css_set) or losing the last. - * Update @cgrp->populated_cnt accordingly. The count is propagated - * towards root so that a given cgroup's populated_cnt is zero iff the - * cgroup and all its descendants are empty. + * One of the css_sets associated with @cgrp is either getting its first + * task or losing the last. Update @cgrp->populated_cnt accordingly. The + * count is propagated towards root so that a given cgroup's populated_cnt + * is zero iff the cgroup and all its descendants don't contain any tasks. * * @cgrp's interface file "cgroup.populated" is zero if * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt @@ -502,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */ */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); do { bool trigger; @@ -515,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - if (cgrp->populated_kn) - kernfs_notify(cgrp->populated_kn); + check_for_release(cgrp); + cgroup_file_notify(&cgrp->events_file); + cgrp = cgroup_parent(cgrp); } while (cgrp); } +/** + * css_set_update_populated - update populated state of a css_set + * @cset: target css_set + * @populated: whether @cset is populated or depopulated + * + * @cset is either getting the first task or losing the last. Update the + * ->populated_cnt of all associated cgroups accordingly. 
+ */ +static void css_set_update_populated(struct css_set *cset, bool populated) +{ + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + cgroup_update_populated(link->cgrp, populated); +} + +/** + * css_set_move_task - move a task from one css_set to another + * @task: task being moved + * @from_cset: css_set @task currently belongs to (may be NULL) + * @to_cset: new css_set @task is being moved to (may be NULL) + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks + * + * Move @task from @from_cset to @to_cset. If @task didn't belong to any + * css_set, @from_cset can be NULL. If @task is being disassociated + * instead of moved, @to_cset can be NULL. + * + * This function automatically handles populated_cnt updates and + * css_task_iter adjustments but the caller is responsible for managing + * @from_cset and @to_cset's reference counts. + */ +static void css_set_move_task(struct task_struct *task, + struct css_set *from_cset, struct css_set *to_cset, + bool use_mg_tasks) +{ + lockdep_assert_held(&css_set_lock); + + if (from_cset) { + struct css_task_iter *it, *pos; + + WARN_ON_ONCE(list_empty(&task->cg_list)); + + /* + * @task is leaving, advance task iterators which are + * pointing to it so that they can resume at the next + * position. Advancing an iterator might remove it from + * the list, use safe walk. See css_task_iter_advance*() + * for details. + */ + list_for_each_entry_safe(it, pos, &from_cset->task_iters, + iters_node) + if (it->task_pos == &task->cg_list) + css_task_iter_advance(it); + + list_del_init(&task->cg_list); + if (!css_set_populated(from_cset)) + css_set_update_populated(from_cset, false); + } else { + WARN_ON_ONCE(!list_empty(&task->cg_list)); + } + + if (to_cset) { + /* + * We are synchronized through cgroup_threadgroup_rwsem + * against PF_EXITING setting such that we can't race + * against cgroup_exit() changing the css_set to + * init_css_set and dropping the old one. + */ + WARN_ON_ONCE(task->flags & PF_EXITING); + + if (!css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + rcu_assign_pointer(task->cgroups, to_cset); + list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : + &to_cset->tasks); + } +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. 
This hash doesn't (currently) take into @@ -548,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset) struct cgroup_subsys *ss; int ssid; - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (!atomic_dec_and_test(&cset->refcount)) return; @@ -560,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset) css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { - struct cgroup *cgrp = link->cgrp; - list_del(&link->cset_link); list_del(&link->cgrp_link); - - /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links)) { - cgroup_update_populated(cgrp, false); - check_for_release(cgrp); - } - + if (cgroup_parent(link->cgrp)) + cgroup_put(link->cgrp); kfree(link); } @@ -587,9 +781,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); put_css_set_locked(cset); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /* @@ -778,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link->cset = cset; link->cgrp = cgrp; - if (list_empty(&cgrp->cset_links)) - cgroup_update_populated(cgrp, true); - list_move(&link->cset_link, &cgrp->cset_links); - /* - * Always add links to the tail of the list so that the list - * is sorted by order of hierarchy creation + * Always add links to the tail of the lists so that the lists are + * in chronological order. */ + list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); + + if (cgroup_parent(cgrp)) + cgroup_get(cgrp); } /** @@ -812,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); if (cset) return cset; @@ -837,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->mg_preload_node); INIT_LIST_HEAD(&cset->mg_node); + INIT_LIST_HEAD(&cset->task_iters); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); /* Add reference counts and links from the new css_set.
*/ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -865,7 +1060,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, list_add_tail(&cset->e_cset_node[ssid], &cset->subsys[ssid]->cgroup->e_csets[ssid]); - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return cset; } @@ -929,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } - up_write(&css_set_rwsem); + + spin_unlock_bh(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -958,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup *res = NULL; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); if (cset == &init_css_set) { res = &root->cgrp; @@ -981,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_rwsem held. + * called with cgroup_mutex and css_set_lock held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -1020,17 +1216,19 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { + struct cgroup_subsys *ss = cft->ss; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", - cft->ss->name, cft->name); + cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, + cft->name); else strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; @@ -1040,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander + * S_IRUGO for read, S_IWUSR for write. 
*/ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; - if (cft->mode) - return cft->mode; - if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write) - mode |= S_IWUSR; + if (cft->write_u64 || cft->write_s64 || cft->write) { + if (cft->flags & CFTYPE_WORLD_WRITABLE) + mode |= S_IWUGO; + else + mode |= S_IWUSR; + } return mode; } -static void cgroup_get(struct cgroup *cgrp) -{ - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); -} - -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - /** * cgroup_calc_child_subsys_mask - calculate child_subsys_mask * @cgrp: the target cgroup @@ -1217,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_dir - remove subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be removed + * css_clear_dir - remove subsys files in a cgroup directory + * @css: target css + * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void css_clear_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) { - struct cgroup_subsys *ss; - int i; + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts; - for_each_subsys(ss, i) { - struct cftype *cfts; + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); +} - if (!(subsys_mask & (1 << i))) - continue; - list_for_each_entry(cfts, &ss->cfts, node) - cgroup_addrm_files(cgrp, cfts, false); +/** + * css_populate_dir - create subsys files in a cgroup directory + * @css: target css + * @cgrp_override: specify if target cgroup is different from css->cgroup + * + * On failure, no file is added. + */ +static int css_populate_dir(struct cgroup_subsys_state *css, + struct cgroup *cgrp_override) +{ + struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cftype *cfts, *failed_cfts; + int ret; + + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_dfl_base_files; + else + cfts = cgroup_legacy_base_files; + + return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + } + + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } } + return 0; +err: + list_for_each_entry(cfts, &css->ss->cfts, node) { + if (cfts == failed_cfts) + break; + cgroup_addrm_files(css, cgrp, cfts, false); + } + return ret; } static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { + struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; unsigned long tmp_ss_mask; int ssid, i, ret; @@ -1260,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); - if (ret) { - if (dst_root != &cgrp_dfl_root) - return ret; + for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + struct cgroup *scgrp = &ss->root->cgrp; + int tssid; + + ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); + if (!ret) + continue; /* * Rebinding back to the default root is not allowed to @@ -1271,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * be rare.
Moving subsystems back and forth even more so. * Just warn about it and continue. */ - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + if (dst_root == &cgrp_dfl_root) { + if (cgrp_dfl_root_visible) { + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + } + continue; + } + + for_each_subsys_which(ss, tssid, &tmp_ss_mask) { + if (tssid == ssid) + break; + css_clear_dir(cgroup_css(scgrp, ss), dcgrp); } + return ret; } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys_which(ss, ssid, &ss_mask) - cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - for_each_subsys_which(ss, ssid, &ss_mask) { - struct cgroup_root *src_root; - struct cgroup_subsys_state *css; + struct cgroup_root *src_root = ss->root; + struct cgroup *scgrp = &src_root->cgrp; + struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset; - src_root = ss->root; - css = cgroup_css(&src_root->cgrp, ss); + WARN_ON(!css || cgroup_css(dcgrp, ss)); - WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); + css_clear_dir(css, NULL); - RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); - rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); + RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); + rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; - css->cgroup = &dst_root->cgrp; + css->cgroup = dcgrp; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], - &dst_root->cgrp.e_csets[ss->id]); - up_write(&css_set_rwsem); + &dcgrp->e_csets[ss->id]); + spin_unlock_bh(&css_set_lock); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(&src_root->cgrp); + scgrp->subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(scgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) { - dst_root->cgrp.subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + if (dst_root == &cgrp_dfl_root) { + static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); + } else { + dcgrp->subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(dcgrp); + static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } if (ss->bind) ss->bind(css); } - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dcgrp->kn); return 0; } @@ -1332,9 +1561,10 @@ static int cgroup_show_options(struct seq_file *seq, struct cgroup_subsys *ss; int ssid; - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_printf(seq, ",%s", ss->name); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1342,13 +1572,14 @@ static int cgroup_show_options(struct seq_file *seq, spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); spin_unlock(&release_agent_path_lock); if 
(test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); return 0; } @@ -1447,9 +1678,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) } for_each_subsys(ss, i) { - if (strcmp(token, ss->name)) + if (strcmp(token, ss->legacy_name)) continue; - if (ss->disabled) + if (!cgroup_ssid_enabled(i)) continue; /* Mutually exclusive option 'all' + subsystem name */ @@ -1480,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (!ss->disabled) + if (cgroup_ssid_enabled(i)) opts->subsys_mask |= (1 << i); /* @@ -1576,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1606,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void) if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); - list_add(&p->cg_list, &cset->tasks); + if (!css_set_populated(cset)) + css_set_update_populated(cset, true); + list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } spin_unlock_irq(&p->sighand->siglock); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1623,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); + INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -1660,13 +1894,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; - struct cftype *base_files; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -1677,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) goto out; /* - * We're accessing css_set_count without locking css_set_rwsem here, + * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. The worst that can happen is that we * have some link structures left over @@ -1699,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - if (root == &cgrp_dfl_root) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(root_cgrp, base_files, true); + ret = css_populate_dir(&root_cgrp->self, NULL); if (ret) goto destroy_root; @@ -1724,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. 
*/ - down_write(&css_set_rwsem); - hash_for_each(css_set_table, i, cset, hlist) + spin_lock_bh(&css_set_lock); + hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); - up_write(&css_set_rwsem); + if (css_set_populated(cset)) + cgroup_update_populated(root_cgrp, true); + } + spin_unlock_bh(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -1960,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -1973,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -2001,6 +2232,49 @@ struct cgroup_taskset { struct task_struct *cur_task; }; +#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +/** + * cgroup_taskset_add - try to add a migration target task to a taskset + * @task: target task + * @tset: target taskset + * + * Add @task, which is a migration target, to @tset. This function becomes + * noop if @task doesn't need to be migrated. @task's css_set should have + * been added as a migration source and @task->cg_list will be moved from + * the css_set's tasks list to mg_tasks one. + */ +static void cgroup_taskset_add(struct task_struct *task, + struct cgroup_taskset *tset) +{ + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + return; + + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + return; + + cset = task_css_set(task); + if (!cset->mg_src_cgrp) + return; + + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset->src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset->dst_csets); +} + /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest @@ -2048,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) } /** - * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp: the cgroup @tsk is being migrated from - * @tsk: the task being migrated - * @new_cset: the new css_set @tsk is being attached to + * cgroup_taskset_migrate - migrate a taskset to a cgroup + * @tset: target taskset + * @dst_cgrp: destination cgroup * - * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. + * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the + * ->can_attach callbacks fails and guarantees that either all or none of + * the tasks in @tset are migrated. @tset is consumed regardless of + * success.
*/ -static void cgroup_task_migrate(struct cgroup *old_cgrp, - struct task_struct *tsk, - struct css_set *new_cset) +static int cgroup_taskset_migrate(struct cgroup_taskset *tset, + struct cgroup *dst_cgrp) { - struct css_set *old_cset; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + struct cgroup_subsys_state *css, *failed_css = NULL; + struct task_struct *task, *tmp_task; + struct css_set *cset, *tmp_cset; + int i, ret; - /* - * We are synchronized through cgroup_threadgroup_rwsem against - * PF_EXITING setting such that we can't race against cgroup_exit() - * changing the css_set to init_css_set and dropping the old one. - */ - WARN_ON_ONCE(tsk->flags & PF_EXITING); - old_cset = task_css_set(tsk); + /* methods shouldn't be called if no task is actually migrating */ + if (list_empty(&tset->src_csets)) + return 0; - get_css_set(new_cset); - rcu_assign_pointer(tsk->cgroups, new_cset); + /* check that we can legitimately attach to the cgroup */ + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->can_attach) { + ret = css->ss->can_attach(css, tset); + if (ret) { + failed_css = css; + goto out_cancel_attach; + } + } + } /* - * Use move_tail so that cgroup_taskset_first() still returns the - * leader after migration. This works because cgroup_migrate() - * ensures that the dst_cset of the leader is the first on the - * tset's dst_csets list. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ - list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); + spin_lock_bh(&css_set_lock); + list_for_each_entry(cset, &tset->src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { + struct css_set *from_cset = task_css_set(task); + struct css_set *to_cset = cset->mg_dst_cset; + + get_css_set(to_cset); + css_set_move_task(task, from_cset, to_cset, true); + put_css_set_locked(from_cset); + } + } + spin_unlock_bh(&css_set_lock); /* - * We just gained a reference on old_cset by taking it from the - * task. As trading it for new_cset is protected by cgroup_mutex, - * we're safe to drop it here; it will be freed under RCU. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. 
*/ - put_css_set_locked(old_cset); + tset->csets = &tset->dst_csets; + + for_each_e_css(css, i, dst_cgrp) + if (css->ss->attach) + css->ss->attach(css, tset); + + ret = 0; + goto out_release_tset; + +out_cancel_attach: + for_each_e_css(css, i, dst_cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, tset); + } +out_release_tset: + spin_lock_bh(&css_set_lock); + list_splice_init(&tset->dst_csets, &tset->src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + spin_unlock_bh(&css_set_lock); + return ret; } /** @@ -2104,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /** @@ -2137,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&css_set_rwsem); + lockdep_assert_held(&css_set_lock); src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2226,9 +2539,9 @@ err: /** * cgroup_migrate - migrate a process or task to a cgroup - * @cgrp: the destination cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task + * @cgrp: the destination cgroup * * Migrate a process or task denoted by @leader to @cgrp. If migrating a * process, the caller must be holding cgroup_threadgroup_rwsem. The @@ -2242,115 +2555,29 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ -static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) -{ - struct cgroup_taskset tset = { - .src_csets = LIST_HEAD_INIT(tset.src_csets), - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), - .csets = &tset.src_csets, - }; - struct cgroup_subsys_state *css, *failed_css = NULL; - struct css_set *cset, *tmp_cset; - struct task_struct *task, *tmp_task; - int i, ret; +static int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup *cgrp) +{ + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + struct task_struct *task; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); task = leader; do { - /* @task either already exited or can't exit until the end */ - if (task->flags & PF_EXITING) - goto next; - - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - goto next; - - cset = task_css_set(task); - if (!cset->mg_src_cgrp) - goto next; - - /* - * cgroup_taskset_first() must always return the leader. - * Take care to avoid disturbing the ordering. 
- */ - list_move_tail(&task->cg_list, &cset->mg_tasks); - if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset.src_csets); - if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset.dst_csets); - next: + cgroup_taskset_add(task, &tset); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); - up_write(&css_set_rwsem); - - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset.src_csets)) - return 0; - - /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, cgrp) { - if (css->ss->can_attach) { - ret = css->ss->can_attach(css, &tset); - if (ret) { - failed_css = css; - goto out_cancel_attach; - } - } - } - - /* - * Now that we're guaranteed success, proceed to move all tasks to - * the new cgroup. There are no failure cases after here, so this - * is the commit point. - */ - down_write(&css_set_rwsem); - list_for_each_entry(cset, &tset.src_csets, mg_node) { - list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) - cgroup_task_migrate(cset->mg_src_cgrp, task, - cset->mg_dst_cset); - } - up_write(&css_set_rwsem); - - /* - * Migration is committed, all target tasks are now on dst_csets. - * Nothing is sensitive to fork() after this point. Notify - * controllers that migration is complete. - */ - tset.csets = &tset.dst_csets; - - for_each_e_css(css, i, cgrp) - if (css->ss->attach) - css->ss->attach(css, &tset); - - ret = 0; - goto out_release_tset; + spin_unlock_bh(&css_set_lock); -out_cancel_attach: - for_each_e_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } -out_release_tset: - down_write(&css_set_rwsem); - list_splice_init(&tset.dst_csets, &tset.src_csets); - list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { - list_splice_tail_init(&cset->mg_tasks, &cset->tasks); - list_del_init(&cset->mg_node); - } - up_write(&css_set_rwsem); - return ret; + return cgroup_taskset_migrate(&tset, cgrp); } /** @@ -2369,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, int ret; /* look up all src csets */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2379,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); if (!ret) - ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2412,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *cgrp; struct inode *inode; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); while (!cgroup_is_descendant(dst_cgrp, cgrp)) cgrp = cgroup_parent(cgrp); ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_kn); + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); if (inode) { ret = inode_permission(inode, MAY_WRITE); iput(inode); @@ -2511,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (root == &cgrp_dfl_root) continue; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); from_cgrp = 
task_cgroup_from_root(from, root); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2628,6 +2855,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); + struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct cgroup_subsys_state *css; struct css_set *src_cset; int ret; @@ -2637,7 +2865,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { struct cgrp_cset_link *link; @@ -2649,57 +2877,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_migrate_add_src(link->cset, cgrp, &preloaded_csets); } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); if (ret) goto out_finish; + spin_lock_bh(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { - struct task_struct *last_task = NULL, *task; + struct task_struct *task, *ntask; /* src_csets precede dst_csets, break on the first dst_cset */ if (!src_cset->mg_src_cgrp) break; - /* - * All tasks in src_cset need to be migrated to the - * matching dst_cset. Empty it process by process. We - * walk tasks but migrate processes. The leader might even - * belong to a different cset but such src_cset would also - * be among the target src_csets because the default - * hierarchy enforces per-process membership. - */ - while (true) { - down_read(&css_set_rwsem); - task = list_first_entry_or_null(&src_cset->tasks, - struct task_struct, cg_list); - if (task) { - task = task->group_leader; - WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); - get_task_struct(task); - } - up_read(&css_set_rwsem); - - if (!task) - break; - - /* guard against possible infinite loop */ - if (WARN(last_task == task, - "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) - goto out_finish; - last_task = task; - - ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - - put_task_struct(task); - - if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) - goto out_finish; - } + /* all tasks in src_csets need to be migrated */ + list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) + cgroup_taskset_add(task, &tset); } + spin_unlock_bh(&css_set_lock); + ret = cgroup_taskset_migrate(&tset, cgrp); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); @@ -2729,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (!cgroup_ssid_enabled(ssid) || + strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { @@ -2853,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = create_css(child, ss, cgrp->subtree_control & (1 << ssid)); else - ret = cgroup_populate_dir(child, 1 << ssid); + ret = css_populate_dir(cgroup_css(child, ss), + NULL); if (ret) goto err_undo_css; } @@ -2886,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if 
(css_disable & (1 << ssid)) { kill_css(css); } else { - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); if (ss->css_reset) ss->css_reset(css); } @@ -2934,15 +3135,16 @@ err_undo_css: if (css_enable & (1 << ssid)) kill_css(css); else - cgroup_clear_dir(child, 1 << ssid); + css_clear_dir(css, NULL); } } goto out_unlock; } -static int cgroup_populated_show(struct seq_file *seq, void *v) +static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + seq_printf(seq, "populated %d\n", + cgroup_is_populated(seq_css(seq)->cgroup)); return 0; } @@ -3085,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } -static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, + struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; @@ -3107,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->write == cgroup_procs_write) - cgrp->procs_kn = kn; - else if (cft->seq_show == cgroup_populated_show) - cgrp->populated_kn = kn; + if (cft->file_offset) { + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + kernfs_get(kn); + cfile->kn = kn; + list_add(&cfile->node, &css->files); + } + return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory - * @cgrp: the target cgroup + * @css: the target css + * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * For removals, this function never fails. If addition fails, this - * function doesn't remove files already added. The caller is responsible - * for cleaning up. + * For removals, this function never fails. */ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], +static int cgroup_addrm_files(struct cgroup_subsys_state *css, + struct cgroup *cgrp, struct cftype cfts[], bool is_add) { - struct cftype *cft; + struct cftype *cft, *cft_end = NULL; int ret; lockdep_assert_held(&cgroup_mutex); - for (cft = cfts; cft->name[0] != '\0'; cft++) { +restart: + for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; @@ -3145,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if (is_add) { - ret = cgroup_add_file(cgrp, cft); + ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); - return ret; + cft_end = cft; + is_add = false; + goto restart; } } else { cgroup_rm_file(cgrp, cft); @@ -3175,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - ret = cgroup_addrm_files(cgrp, cfts, is_add); + ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } @@ -3287,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; - if (ss->disabled) + if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') @@ -3337,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - /* - * If legacy_flies_on_dfl, we want to show the legacy files on the - * dfl hierarchy but iff the target subsystem hasn't been updated - * for the dfl hierarchy yet. - */ - if (!cgroup_legacy_files_on_dfl || - ss->dfl_cftypes != ss->legacy_cftypes) { - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; - } - + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } @@ -3362,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return count; } @@ -3597,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css) } /** - * css_advance_task_iter - advance a task itererator to the next css_set + * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void css_advance_task_iter(struct css_task_iter *it) +static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; + lockdep_assert_held(&css_set_lock); + /* Advance to the next non-empty css_set */ do { l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; + it->task_pos = NULL; return; } @@ -3623,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it) link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } - } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + } while (!css_set_populated(cset)); it->cset_pos = l; @@ -3634,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it) it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + + /* + * We don't keep css_sets locked across iteration steps and thus + * need to take steps to ensure that iteration can be resumed after + * the lock is re-acquired. Iteration is performed at two levels - + * css_sets and tasks in them. + * + * Once created, a css_set never leaves its cgroup lists, so a + * pinned css_set is guaranteed to stay put and we can resume + * iteration afterwards. + * + * Tasks may leave @cset across iteration steps. 
This is resolved + * by registering each iterator with the css_set currently being + * walked and making css_set_move_task() advance iterators whose + * next task is leaving. + */ + if (it->cur_cset) { + list_del(&it->iters_node); + put_css_set_locked(it->cur_cset); + } + get_css_set(cset); + it->cur_cset = cset; + list_add(&it->iters_node, &cset->task_iters); +} + +static void css_task_iter_advance(struct css_task_iter *it) +{ + struct list_head *l = it->task_pos; + + lockdep_assert_held(&css_set_lock); + WARN_ON_ONCE(!l); + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ + l = l->next; + + if (l == it->tasks_head) + l = it->mg_tasks_head->next; + + if (l == it->mg_tasks_head) + css_task_iter_advance_css_set(it); + else + it->task_pos = l; } /** @@ -3645,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it) * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. - * - * Note that this function acquires a lock which is released when the - * iteration finishes. The caller can't sleep while iteration is in - * progress. */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_rwsem) { /* no one should try to iterate before mounting cgroups */ WARN_ON_ONCE(!use_task_css_set_links); - down_read(&css_set_rwsem); + memset(it, 0, sizeof(*it)); + + spin_lock_bh(&css_set_lock); it->ss = css->ss; @@ -3668,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css, it->cset_head = it->cset_pos; - css_advance_task_iter(it); + css_task_iter_advance_css_set(it); + + spin_unlock_bh(&css_set_lock); } /** @@ -3681,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { - struct task_struct *res; - struct list_head *l = it->task_pos; - - /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_pos) - return NULL; - res = list_entry(l, struct task_struct, cg_list); + if (it->cur_task) { + put_task_struct(it->cur_task); + it->cur_task = NULL; + } - /* - * Advance iterator to find next entry. cset->tasks is consumed - * first and then ->mg_tasks. After ->mg_tasks, we move onto the - * next cset. - */ - l = l->next; + spin_lock_bh(&css_set_lock); - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (it->task_pos) { + it->cur_task = list_entry(it->task_pos, struct task_struct, + cg_list); + get_task_struct(it->cur_task); + css_task_iter_advance(it); + } - if (l == it->mg_tasks_head) - css_advance_task_iter(it); - else - it->task_pos = l; + spin_unlock_bh(&css_set_lock); - return res; + return it->cur_task; } /** @@ -3714,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). 
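/*
 * Illustrative sketch, not part of this patch: with css_set_lock now an
 * internal spinlock taken only inside the iterator helpers, and with
 * css_task_iter_next() pinning the task it returns until the next step,
 * a caller such as this hypothetical walker may sleep between iteration
 * steps (the old "caller can't sleep" restriction above is gone).
 */
#include <linux/cgroup.h>
#include <linux/sched.h>

static void example_walk_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it))) {
		/* @task stays pinned until the next css_task_iter_next() */
		pr_info("visiting pid %d\n", task_pid_nr(task));
		cond_resched();		/* sleeping here is now permitted */
	}
	css_task_iter_end(&it);
}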
*/ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_rwsem) { - up_read(&css_set_rwsem); + if (it->cur_cset) { + spin_lock_bh(&css_set_lock); + list_del(&it->iters_node); + put_css_set_locked(it->cur_cset); + spin_unlock_bh(&css_set_lock); + } + + if (it->cur_task) + put_task_struct(it->cur_task); } /** @@ -3741,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); if (ret) @@ -3762,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(to, task, false); + ret = cgroup_migrate(task, false, to); put_task_struct(task); } } while (task && !ret); @@ -4259,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", + .file_offset = offsetof(struct cgroup, procs_file), .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.controllers", @@ -4283,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = { .write = cgroup_subtree_control_write, }, { - .name = "cgroup.populated", + .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cgroup_populated_show, + .file_offset = offsetof(struct cgroup, events_file), + .seq_show = cgroup_events_show, }, { } /* terminate */ }; @@ -4300,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup_procs_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "cgroup.clone_children", @@ -4320,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = { .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup_tasks_write, - .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", @@ -4337,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = { { } /* terminate */ }; -/** - * cgroup_populate_dir - create subsys files in a cgroup directory - * @cgrp: target cgroup - * @subsys_mask: mask of the subsystem ids whose files should be added - * - * On failure, no file is added. - */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i, ret = 0; - - /* process cftsets of each subsystem */ - for_each_subsys(ss, i) { - struct cftype *cfts; - - if (!(subsys_mask & (1 << i))) - continue; - - list_for_each_entry(cfts, &ss->cfts, node) { - ret = cgroup_addrm_files(cgrp, cfts, true); - if (ret < 0) - goto err; - } - } - return 0; -err: - cgroup_clear_dir(cgrp, subsys_mask); - return ret; -} - /* * css destruction is four-stage process. 
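/*
 * Illustrative sketch, not part of this patch: userspace reading the new
 * "cgroup.events" file added above, which replaces "cgroup.populated" and
 * reports a "populated <0|1>" key/value line.  The mount point and group
 * name below are assumptions; substitute wherever the default hierarchy
 * is mounted.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/mygroup/cgroup.events", "r");
	char key[32];
	int val;

	if (!f)
		return 1;
	while (fscanf(f, "%31s %d", key, &val) == 2) {
		if (!strcmp(key, "populated"))
			printf("populated: %d\n", val);
	}
	fclose(f);
	return 0;
}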
* @@ -4396,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; + struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); + list_for_each_entry(cfile, &css->files, node) + kernfs_put(cfile->kn); + if (ss) { /* css free path */ int id = css->id; @@ -4503,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { @@ -4579,13 +4801,13 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, if (err) goto err_free_css; - err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_percpu_ref; css->id = err; if (visible) { - err = cgroup_populate_dir(cgrp, 1 << ss->id); + err = css_populate_dir(css, NULL); if (err) goto err_free_id; } @@ -4611,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err_list_del: list_del_rcu(&css->sibling); - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: @@ -4628,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - struct cftype *base_files; int ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. @@ -4656,7 +4877,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; goto out_cancel_ref; @@ -4704,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - if (cgroup_on_dfl(cgrp)) - base_files = cgroup_dfl_base_files; - else - base_files = cgroup_legacy_base_files; - - ret = cgroup_addrm_files(cgrp, base_files, true); + ret = css_populate_dir(&cgrp->self, NULL); if (ret) goto out_destroy; @@ -4796,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - cgroup_clear_dir(css->cgroup, 1 << css->ss->id); + css_clear_dir(css, NULL); /* * Killing would put the base ref, but we need to keep it alive @@ -4845,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; - bool empty; int ssid; lockdep_assert_held(&cgroup_mutex); /* - * css_set_rwsem synchronizes access to ->cset_links and prevents - * @cgrp from being removed while put_css_set() is in progress. + * Only migration can raise populated from zero and we're already + * holding cgroup_mutex. 
*/ - down_read(&css_set_rwsem); - empty = list_empty(&cgrp->cset_links); - up_read(&css_set_rwsem); - if (!empty) + if (cgroup_is_populated(cgrp)) return -EBUSY; /* @@ -4955,6 +5167,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; + have_free_callback |= (bool)ss->free << ss->id; + have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -4993,6 +5207,8 @@ int __init cgroup_init_early(void) ss->id = i; ss->name = cgroup_subsys_name[i]; + if (!ss->legacy_name) + ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); @@ -5000,6 +5216,8 @@ int __init cgroup_init_early(void) return 0; } +static unsigned long cgroup_disable_mask __initdata; + /** * cgroup_init - cgroup initialization * @@ -5010,7 +5228,7 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int ssid, err; + int ssid; BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); @@ -5046,14 +5264,15 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (ss->disabled) + if (cgroup_disable_mask & (1 << ssid)) { + static_branch_disable(cgroup_subsys_enabled_key[ssid]); + printk(KERN_INFO "Disabling %s control group subsystem\n", + ss->name); continue; + } cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) - ss->dfl_cftypes = ss->legacy_cftypes; - if (!ss->dfl_cftypes) cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; @@ -5068,17 +5287,10 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } - err = sysfs_create_mount_point(fs_kobj, "cgroup"); - if (err) - return err; - - err = register_filesystem(&cgroup_fs_type); - if (err < 0) { - sysfs_remove_mount_point(fs_kobj, "cgroup"); - return err; - } + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); + WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); - proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); return 0; } @@ -5125,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; @@ -5136,26 +5348,48 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, continue; seq_printf(m, "%d:", root->hierarchy_id); - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (root != &cgrp_dfl_root) + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", + ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); + cgrp = task_cgroup_from_root(tsk, root); - path = cgroup_path(cgrp, buf, PATH_MAX); - if (!path) { - retval = -ENAMETOOLONG; - goto out_unlock; + + /* + * On traditional hierarchies, all zombie tasks show up as + * belonging to the root cgroup. On the default hierarchy, + * while a zombie doesn't show up in "cgroup.procs" and + * thus can't be migrated, its /proc/PID/cgroup keeps + * reporting the cgroup it belonged to before exiting. 
If + * the cgroup is removed before the zombie is reaped, + * " (deleted)" is appended to the cgroup path. + */ + if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; + goto out_unlock; + } + } else { + path = "/"; } + seq_puts(m, path); - seq_putc(m, '\n'); + + if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) + seq_puts(m, " (deleted)\n"); + else + seq_putc(m, '\n'); } retval = 0; out_unlock: - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5178,8 +5412,9 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), !ss->disabled); + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); mutex_unlock(&cgroup_mutex); return 0; @@ -5197,6 +5432,19 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; +static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) +{ + if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) + return &ss_priv[i - CGROUP_CANFORK_START]; + return NULL; +} + +static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) +{ + void **private = subsys_canfork_priv_p(ss_priv, i); + return private ? *private : NULL; +} + /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5212,6 +5460,57 @@ void cgroup_fork(struct task_struct *child) } /** + * cgroup_can_fork - called on a new task before the process is exposed + * @child: the task in question. + * + * This calls the subsystem can_fork() callbacks. If the can_fork() callback + * returns an error, the fork aborts with that error code. This allows for + * a cgroup subsystem to conditionally allow or deny new forks. + */ +int cgroup_can_fork(struct task_struct *child, + void *ss_priv[CGROUP_CANFORK_COUNT]) +{ + struct cgroup_subsys *ss; + int i, j, ret; + + for_each_subsys_which(ss, i, &have_canfork_callback) { + ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + if (ret) + goto out_revert; + } + + return 0; + +out_revert: + for_each_subsys(ss, j) { + if (j >= i) + break; + if (ss->cancel_fork) + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + } + + return ret; +} + +/** + * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() + * @child: the task in question + * + * This calls the cancel_fork() callbacks if a fork failed *after* + * cgroup_can_fork() succeded. + */ +void cgroup_cancel_fork(struct task_struct *child, + void *ss_priv[CGROUP_CANFORK_COUNT]) +{ + struct cgroup_subsys *ss; + int i; + + for_each_subsys(ss, i) + if (ss->cancel_fork) + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); +} + +/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * @@ -5221,7 +5520,8 @@ void cgroup_fork(struct task_struct *child) * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child) +void cgroup_post_fork(struct task_struct *child, + void *old_ss_priv[CGROUP_CANFORK_COUNT]) { struct cgroup_subsys *ss; int i; @@ -5235,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child) * @child during its iteration. 
* * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_rwsem guarantees both that the + * css_set. Grabbing css_set_lock guarantees both that the * association is stable, and, on completion of the parent's * migration, @child is visible in the source of migration or * already in the destination cgroup. This guarantee is necessary @@ -5250,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child) if (use_task_css_set_links) { struct css_set *cset; - down_write(&css_set_rwsem); + spin_lock_bh(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { - rcu_assign_pointer(child->cgroups, cset); - list_add(&child->cg_list, &cset->tasks); get_css_set(cset); + css_set_move_task(child, NULL, cset, false); } - up_write(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); } /* @@ -5266,7 +5565,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child); + ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); } /** @@ -5292,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; - bool put_cset = false; int i; /* * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check cg_list without grabbing css_set_rwsem. + * with us, we can check css_set and cg_list without synchronization. */ + cset = task_css_set(tsk); + if (!list_empty(&tsk->cg_list)) { - down_write(&css_set_rwsem); - list_del_init(&tsk->cg_list); - up_write(&css_set_rwsem); - put_cset = true; + spin_lock_bh(&css_set_lock); + css_set_move_task(tsk, cset, NULL, false); + spin_unlock_bh(&css_set_lock); + } else { + get_css_set(cset); } - /* Reassign the task to the init_css_set. 
*/ - cset = task_css_set(tsk); - RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) { - struct cgroup_subsys_state *old_css = cset->subsys[i]; - struct cgroup_subsys_state *css = task_css(tsk, i); + for_each_subsys_which(ss, i, &have_exit_callback) + ss->exit(tsk); +} - ss->exit(css, old_css, tsk); - } +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys_which(ss, ssid, &have_free_callback) + ss->free(task); - if (put_cset) - put_css_set(cset); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) { - if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } @@ -5400,26 +5702,16 @@ static int __init cgroup_disable(char *str) continue; for_each_subsys(ss, i) { - if (!strcmp(token, ss->name)) { - ss->disabled = 1; - printk(KERN_INFO "Disabling %s control group" - " subsystem\n", ss->name); - break; - } + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + cgroup_disable_mask |= 1 << i; } } return 1; } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_set_legacy_files_on_dfl(char *str) -{ - printk("cgroup: using legacy files on the default hierarchy\n"); - cgroup_legacy_files_on_dfl = true; - return 0; -} -__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5523,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -5534,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); kfree(name_buf); return 0; } @@ -5545,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - down_read(&css_set_rwsem); + spin_lock_bh(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -5568,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - up_read(&css_set_rwsem); + spin_unlock_bh(&css_set_lock); return 0; } static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return (!cgroup_has_tasks(css->cgroup) && + return (!cgroup_is_populated(css->cgroup) && !css_has_online_children(&css->cgroup->self)); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 92b98cc0ee76..f1b30ad5dc6d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * to do anything as freezer_attach() will put @task into the appropriate * state. 
*/ -static void freezer_fork(struct task_struct *task) +static void freezer_fork(struct task_struct *task, void *private) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c new file mode 100644 index 000000000000..cdd8df4e991c --- /dev/null +++ b/kernel/cgroup_pids.c @@ -0,0 +1,353 @@ +/* + * Process number limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop any new processes from fork()ing + * after a certain limit is reached. + * + * Since it is trivial to hit the task limit without hitting any kmemcg limits + * in place, PIDs are a fundamental resource. As such, PID exhaustion must be + * preventable in the scope of a cgroup hierarchy by allowing resource limiting + * of the number of tasks in a cgroup. + * + * In order to use the `pids` controller, set the maximum number of tasks in + * pids.max (this is not available in the root cgroup for obvious reasons). The + * number of processes currently in the cgroup is given by pids.current. + * Organisational operations are not blocked by cgroup policies, so it is + * possible to have pids.current > pids.max. However, it is not possible to + * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking + * would cause a cgroup policy to be violated. + * + * To set a cgroup to have no limit, set pids.max to "max". This is the default + * for all new cgroups (N.B. that PID limits are hierarchical, so the most + * stringent limit in the hierarchy is followed). + * + * pids.current tracks all child cgroup hierarchies, so parent/pids.current is + * a superset of parent/child/pids.current. + * + * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/atomic.h> +#include <linux/cgroup.h> +#include <linux/slab.h> + +#define PIDS_MAX (PID_MAX_LIMIT + 1ULL) +#define PIDS_MAX_STR "max" + +struct pids_cgroup { + struct cgroup_subsys_state css; + + /* + * Use 64-bit types so that we can safely represent "max" as + * %PIDS_MAX = (%PID_MAX_LIMIT + 1). + */ + atomic64_t counter; + int64_t limit; +}; + +static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) +{ + return container_of(css, struct pids_cgroup, css); +} + +static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) +{ + return css_pids(pids->css.parent); +} + +static struct cgroup_subsys_state * +pids_css_alloc(struct cgroup_subsys_state *parent) +{ + struct pids_cgroup *pids; + + pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); + if (!pids) + return ERR_PTR(-ENOMEM); + + pids->limit = PIDS_MAX; + atomic64_set(&pids->counter, 0); + return &pids->css; +} + +static void pids_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_pids(css)); +} + +/** + * pids_cancel - uncharge the local pid count + * @pids: the pid cgroup state + * @num: the number of pids to cancel + * + * This function will WARN if the pid count goes under 0, because such a case is + * a bug in the pids controller proper. + */ +static void pids_cancel(struct pids_cgroup *pids, int num) +{ + /* + * A negative count (or overflow for that matter) is invalid, + * and indicates a bug in the `pids` controller proper. 
+ */ + WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); +} + +/** + * pids_uncharge - hierarchically uncharge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to uncharge + */ +static void pids_uncharge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; p; p = parent_pids(p)) + pids_cancel(p, num); +} + +/** + * pids_charge - hierarchically charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function does *not* follow the pid limit set. It cannot fail and the new + * pid count may exceed the limit. This is only used for reverting failed + * attaches, where there is no other way out than violating the limit. + */ +static void pids_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; p; p = parent_pids(p)) + atomic64_add(num, &p->counter); +} + +/** + * pids_try_charge - hierarchically try to charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function follows the set limit. It will fail if the charge would cause + * the new value to exceed the hierarchical limit. Returns 0 if the charge + * succeded, otherwise -EAGAIN. + */ +static int pids_try_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p, *q; + + for (p = pids; p; p = parent_pids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + + /* + * Since new is capped to the maximum number of pid_t, if + * p->limit is %PIDS_MAX then we know that this test will never + * fail. + */ + if (new > p->limit) + goto revert; + } + + return 0; + +revert: + for (q = pids; q != p; q = parent_pids(q)) + pids_cancel(q, num); + pids_cancel(p, num); + + return -EAGAIN; +} + +static int pids_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct pids_cgroup *pids = css_pids(css); + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) { + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + /* + * No need to pin @old_css between here and cancel_attach() + * because cgroup core protects it from being freed before + * the migration completes or fails. + */ + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(pids, 1); + pids_uncharge(old_pids, 1); + } + + return 0; +} + +static void pids_cancel_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct pids_cgroup *pids = css_pids(css); + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) { + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(old_pids, 1); + pids_uncharge(pids, 1); + } +} + +static int pids_can_fork(struct task_struct *task, void **priv_p) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + int err; + + /* + * Use the "current" task_css for the pids subsystem as the tentative + * css. It is possible we will charge the wrong hierarchy, in which + * case we will forcefully revert/reapply the charge on the right + * hierarchy after it is committed to the task proper. 
+ */ + css = task_get_css(current, pids_cgrp_id); + pids = css_pids(css); + + err = pids_try_charge(pids, 1); + if (err) + goto err_css_put; + + *priv_p = css; + return 0; + +err_css_put: + css_put(css); + return err; +} + +static void pids_cancel_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css = priv; + struct pids_cgroup *pids = css_pids(css); + + pids_uncharge(pids, 1); + css_put(css); +} + +static void pids_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css; + struct cgroup_subsys_state *old_css = priv; + struct pids_cgroup *pids; + struct pids_cgroup *old_pids = css_pids(old_css); + + css = task_get_css(task, pids_cgrp_id); + pids = css_pids(css); + + /* + * If the association has changed, we have to revert and reapply the + * charge/uncharge on the wrong hierarchy to the current one. Since + * the association can only change due to an organisation event, its + * okay for us to ignore the limit in this case. + */ + if (pids != old_pids) { + pids_uncharge(old_pids, 1); + pids_charge(pids, 1); + } + + css_put(css); + css_put(old_css); +} + +static void pids_free(struct task_struct *task) +{ + struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); + + pids_uncharge(pids, 1); +} + +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct pids_cgroup *pids = css_pids(css); + int64_t limit; + int err; + + buf = strstrip(buf); + if (!strcmp(buf, PIDS_MAX_STR)) { + limit = PIDS_MAX; + goto set_limit; + } + + err = kstrtoll(buf, 0, &limit); + if (err) + return err; + + if (limit < 0 || limit >= PIDS_MAX) + return -EINVAL; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. 
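/*
 * Illustrative sketch, not part of this patch: exercising the pids
 * controller from userspace.  The mount path and group name are
 * assumptions; the calling process is assumed to already sit in
 * "mygroup" so that its forks are charged against the limit.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/pids/mygroup/pids.max", "w");

	if (!f)
		return 1;
	fprintf(f, "2\n");	/* allow at most two tasks in the group */
	fclose(f);

	for (;;) {
		pid_t pid = fork();

		if (pid < 0) {
			/* the limit is enforced at fork() time */
			perror("fork");	/* expect EAGAIN once the limit is hit */
			return 0;
		}
		if (pid == 0)
			pause();	/* children linger, each holding a pid charge */
	}
}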
+ */ + pids->limit = limit; + return nbytes; +} + +static int pids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct pids_cgroup *pids = css_pids(css); + int64_t limit = pids->limit; + + if (limit >= PIDS_MAX) + seq_printf(sf, "%s\n", PIDS_MAX_STR); + else + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 pids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return atomic64_read(&pids->counter); +} + +static struct cftype pids_files[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys pids_cgrp_subsys = { + .css_alloc = pids_css_alloc, + .css_free = pids_css_free, + .can_attach = pids_can_attach, + .cancel_attach = pids_cancel_attach, + .can_fork = pids_can_fork, + .cancel_fork = pids_cancel_fork, + .fork = pids_fork, + .free = pids_free, + .legacy_cftypes = pids_files, + .dfl_cftypes = pids_files, +}; diff --git a/kernel/cpu.c b/kernel/cpu.c index 5644ec5582b9..85ff5e26e23b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -102,19 +102,6 @@ void get_online_cpus(void) } EXPORT_SYMBOL_GPL(get_online_cpus); -bool try_get_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return true; - if (!mutex_trylock(&cpu_hotplug.lock)) - return false; - cpuhp_lock_acquire_tryread(); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); - return true; -} -EXPORT_SYMBOL_GPL(try_get_online_cpus); - void put_online_cpus(void) { int refcount; @@ -191,21 +178,22 @@ void cpu_hotplug_done(void) void cpu_hotplug_disable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; + cpu_hotplug_disabled++; cpu_maps_update_done(); } +EXPORT_SYMBOL_GPL(cpu_hotplug_disable); void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); cpu_maps_update_done(); } - +EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
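/*
 * Illustrative sketch, not part of this patch: because cpu_hotplug_disabled
 * is now a counter and the helpers are exported, independent users can nest
 * their hotplug-free sections without clobbering each other's state.
 */
#include <linux/cpu.h>

static void example_user_a(void)
{
	cpu_hotplug_disable();
	/* ... work that must not race with CPU hotplug ... */
	cpu_hotplug_enable();
}

static void example_user_b(void)
{
	cpu_hotplug_disable();	/* may overlap with example_user_a() */
	/* ... */
	cpu_hotplug_enable();	/* hotplug resumes only when the count drops to zero */
}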
*/ -int __ref register_cpu_notifier(struct notifier_block *nb) +int register_cpu_notifier(struct notifier_block *nb) { int ret; cpu_maps_update_begin(); @@ -214,7 +202,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } -int __ref __register_cpu_notifier(struct notifier_block *nb) +int __register_cpu_notifier(struct notifier_block *nb) { return raw_notifier_chain_register(&cpu_chain, nb); } @@ -244,7 +232,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); -void __ref unregister_cpu_notifier(struct notifier_block *nb) +void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); @@ -252,7 +240,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -void __ref __unregister_cpu_notifier(struct notifier_block *nb) +void __unregister_cpu_notifier(struct notifier_block *nb) { raw_notifier_chain_unregister(&cpu_chain, nb); } @@ -303,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu) { struct task_struct *g, *p; - read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { if (!p->on_rq) continue; /* @@ -319,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu) pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + } + read_unlock(&tasklist_lock); } struct take_cpu_down_param { @@ -329,7 +317,7 @@ struct take_cpu_down_param { }; /* Take this CPU down. */ -static int __ref take_cpu_down(void *_param) +static int take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; int err; @@ -343,12 +331,12 @@ static int __ref take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - kthread_park(current); + stop_machine_park((long)param->hcpu); return 0; } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -381,14 +369,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * will observe it. * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so explicitly call both. + * not imply sync_sched(), so wait for both. * * Do sync before park smpboot threads to take care the rcu boost case. */ -#ifdef CONFIG_PREEMPT - synchronize_sched(); -#endif - synchronize_rcu(); + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); smpboot_park_threads(cpu); @@ -401,7 +389,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. 
*/ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -442,7 +430,7 @@ out_release: return err; } -int __ref cpu_down(unsigned int cpu) +int cpu_down(unsigned int cpu) { int err; @@ -608,13 +596,18 @@ int disable_nonboot_cpus(void) } } - if (!error) { + if (!error) BUG_ON(num_online_cpus() > 1); - /* Make sure the CPUs won't be enabled by someone else */ - cpu_hotplug_disabled = 1; - } else { + else pr_err("Non-boot CPUs are not disabled\n"); - } + + /* + * Make sure the CPUs won't be enabled by someone else. We need to do + * this even in case of failure as all disable_nonboot_cpus() users are + * supposed to do enable_nonboot_cpus() on the failure path. + */ + cpu_hotplug_disabled++; + cpu_maps_update_done(); return error; } @@ -627,13 +620,13 @@ void __weak arch_enable_nonboot_cpus_end(void) { } -void __ref enable_nonboot_cpus(void) +void enable_nonboot_cpus(void) { int cpu, error; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); if (cpumask_empty(frozen_cpus)) goto out; diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 9656a3c36503..009cc9a17d95 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); * low power state that may have caused some blocks in the same power domain * to reset. * - * Must be called after cpu_pm_exit has been called on all cpus in the power + * Must be called after cpu_cluster_pm_enter has been called for the power * domain, and before cpu_pm_exit has been called on any cpu in the power * domain. Notified drivers can include VFP co-processor, interrupt controller * and its PM extensions, local CPU timers context save/restore which diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ee14e3a35a29..10ae73611d80 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_cpuset_subset(trial, par)) goto out; /* @@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. 
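/*
 * Illustrative sketch, not part of this patch: per the new comment in
 * disable_nonboot_cpus() above, the disabled count is raised even when
 * disabling fails, so a hypothetical suspend-like caller must still call
 * enable_nonboot_cpus() on its failure path.
 */
#include <linux/cpu.h>

static int example_suspend_like_path(void)
{
	int error = disable_nonboot_cpus();

	if (error)
		goto resume;	/* the count was bumped anyway, so still re-enable */

	/* ... do work with only the boot CPU online ... */

resume:
	enable_nonboot_cpus();
	return error;
}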
*/ @@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1223,7 +1226,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ - update_nodemasks_hier(cs, &cs->mems_allowed); + update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } @@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_on_dfl(css->cgroup) && + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; - struct mm_struct *mm; struct task_struct *task; - struct task_struct *leader = cgroup_taskset_first(tset); + struct task_struct *leader; struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; @@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css, } /* - * Change mm, possibly for multiple threads in a threadgroup. This is - * expensive and may sleep. + * Change mm for all threadgroup leaders. This is expensive and may + * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - mm = get_task_mm(leader); - if (mm) { - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - - /* - * old_mems_allowed is the same with mems_allowed here, except - * if this task is being moved automatically due to hotplug. - * In that case @mems_allowed has been updated and is empty, - * so @old_mems_allowed is the right nodesets that we migrate - * mm from. - */ - if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, - &cpuset_attach_nodemask_to); + cgroup_taskset_for_each_leader(leader, tset) { + struct mm_struct *mm = get_task_mm(leader); + + if (mm) { + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); + + /* + * old_mems_allowed is the same with mems_allowed + * here, except if this task is being moved + * automatically due to hotplug. In that case + * @mems_allowed has been updated and is empty, so + * @old_mems_allowed is the right nodesets that we + * migrate mm from. 
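/*
 * Illustrative sketch, not part of this patch: a hypothetical controller's
 * ->attach() combining the per-task iterator with the leader-only iterator
 * used by cpuset_attach() above, so that cheap per-thread updates and
 * expensive per-process (mm-wide) work are kept separate.
 */
#include <linux/cgroup.h>
#include <linux/sched.h>

static void example_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task, *leader;

	cgroup_taskset_for_each(task, tset) {
		/* cheap, per-thread state update */
	}

	cgroup_taskset_for_each_leader(leader, tset) {
		/* expensive per-process work, done once per thread group */
	}
}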
+ */ + if (is_memory_migrate(cs)) { + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, + &cpuset_attach_nodemask_to); + } + mmput(mm); } - mmput(mm); } cs->old_mems_allowed = cpuset_attach_nodemask_to; @@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, case FILE_MEMORY_PRESSURE_ENABLED: cpuset_memory_pressure_enabled = !!val; break; - case FILE_MEMORY_PRESSURE: - retval = -EACCES; - break; case FILE_SPREAD_PAGE: retval = update_flag(CS_SPREAD_PAGE, cs, val); break; @@ -1863,9 +1866,6 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE, - .mode = S_IRUGO, }, { @@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(cs->css.cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_on_dfl(root_css->cgroup)) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2210,7 +2210,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_on_dfl(cs->css.cgroup)) + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); + bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); mutex_lock(&cpuset_mutex); @@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, } /** - * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @tsk: pointer to task_struct of some task. + * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * - * Description: Prints @task's name, cpuset name, and cached copy of its + * Description: Prints current's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. */ -void cpuset_print_task_mems_allowed(struct task_struct *tsk) +void cpuset_print_current_mems_allowed(void) { struct cgroup *cgrp; rcu_read_lock(); - cgrp = task_cs(tsk)->css.cgroup; - pr_info("%s cpuset=", tsk->comm); + cgrp = task_cs(current)->css.cgroup; + pr_info("%s cpuset=", current->comm); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); + pr_cont(" mems_allowed=%*pbl\n", + nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); } diff --git a/kernel/cred.c b/kernel/cred.c index ec1c07667ec1..71179a09c1d6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -20,11 +20,16 @@ #include <linux/cn_proc.h> #if 0 -#define kdebug(FMT, ...) \ - printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk("[%-5.5s%5u] " FMT "\n", \ + current->comm, current->pid, ##__VA_ARGS__) #else -#define kdebug(FMT, ...) \ - no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#define kdebug(FMT, ...) 
\ +do { \ + if (0) \ + no_printk("[%-5.5s%5u] " FMT "\n", \ + current->comm, current->pid, ##__VA_ARGS__); \ +} while (0) #endif static struct kmem_cache *cred_jar; diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..1a734e0adfa7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -195,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; -void update_perf_cpu_limits(void) +static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; @@ -471,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, * mode SWOUT : schedule out everything * mode SWIN : schedule in based on cgroup for next */ -void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task, int mode) { struct perf_cpu_context *cpuctx; struct pmu *pmu; @@ -1242,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) +static void __perf_event_read_size(struct perf_event *event, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; @@ -1262,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event) entry += sizeof(u64); if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; + nr += nr_siblings; size += sizeof(u64); } @@ -1270,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event) event->read_size = size; } -static void perf_event__header_size(struct perf_event *event) +static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; u16 size = 0; - perf_event__read_size(event); - if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); @@ -1302,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event) event->header_size = size; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__header_size(struct perf_event *event) +{ + __perf_event_read_size(event, + event->group_leader->nr_siblings); + __perf_event_header_size(event, event->attr.sample_type); +} + static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; @@ -1329,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +static bool perf_event_validate_size(struct perf_event *event) +{ + /* + * The values computed here will be over-written when we actually + * attach the event. + */ + __perf_event_read_size(event, event->group_leader->nr_siblings + 1); + __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); + perf_event__id_header_size(event); + + /* + * Sum the lot; should not exceed the 64k limit we have on records. + * Conservative limit to allow for callchains and other variable fields. 
+ */ + if (event->read_size + event->header_size + + event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) + return false; + + return true; +} + static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; @@ -1868,8 +1894,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - event->tstamp_running += tstamp - event->tstamp_stopped; - perf_set_shadow_time(event, ctx, tstamp); perf_log_itrace_start(event); @@ -1881,6 +1905,8 @@ event_sched_in(struct perf_event *event, goto out; } + event->tstamp_running += tstamp - event->tstamp_stopped; + if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -1913,7 +1939,7 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - pmu->start_txn(pmu); + pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); @@ -2619,6 +2645,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2670,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2863,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3174,14 +3209,22 @@ void perf_event_exec(void) rcu_read_unlock(); } +struct perf_read_data { + struct perf_event *event; + bool group; + int ret; +}; + /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { - struct perf_event *event = info; + struct perf_read_data *data = info; + struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct pmu *pmu = event->pmu; /* * If this is a task context, we need to check whether it is @@ -3198,9 +3241,35 @@ static void __perf_event_read(void *info) update_context_time(ctx); update_cgrp_time_from_event(event); } + update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + if (event->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!data->group) { + pmu->read(event); + data->ret = 0; + goto unlock; + } + + pmu->start_txn(pmu, PERF_PMU_TXN_READ); + + pmu->read(event); + + list_for_each_entry(sub, &event->sibling_list, group_entry) { + update_event_times(sub); + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + /* + * Use sibling's PMU rather than @event's since + * sibling could be on different (eg: software) PMU. 
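/*
 * Illustrative sketch, not part of this patch: userspace asking for the
 * new context-switch records by setting attr.context_switch on a software
 * dummy event.  There is no glibc wrapper for perf_event_open(), hence the
 * raw syscall; the resulting records are read from the event's mmap ring
 * buffer once it is set up (not shown).
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int open_switch_events(pid_t pid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;
	attr.context_switch = 1;	/* request per-switch records */

	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
}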
+ */ + sub->pmu->read(sub); + } + } + + data->ret = pmu->commit_txn(pmu); + +unlock: raw_spin_unlock(&ctx->lock); } @@ -3212,15 +3281,76 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } -static u64 perf_event_read(struct perf_event *event) +/* + * NMI-safe method to read a local event, that is an event that + * is: + * - either for the current task, or for this CPU + * - does not have inherit set, for inherited task events + * will not be local and we cannot read them atomically + * - must not have a pmu::count method + */ +u64 perf_event_read_local(struct perf_event *event) +{ + unsigned long flags; + u64 val; + + /* + * Disabling interrupts avoids all counter scheduling (context + * switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + + /* If this is a per-task event, it must be for current */ + WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current); + + /* If this is a per-CPU event, it must be for this CPU */ + WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()); + + /* + * It must not be an event with inherit set, we cannot read + * all child counters from atomic context. + */ + WARN_ON_ONCE(event->attr.inherit); + + /* + * It must not have a pmu::count method, those are not + * NMI safe. + */ + WARN_ON_ONCE(event->pmu->count); + + /* + * If the event is currently on this CPU, its either a per-task event, + * or local to this CPU. Furthermore it means its ACTIVE (otherwise + * oncpu == -1). + */ + if (event->oncpu == smp_processor_id()) + event->pmu->read(event); + + val = local64_read(&event->count); + local_irq_restore(flags); + + return val; +} + +static int perf_event_read(struct perf_event *event, bool group) { + int ret = 0; + /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data = { + .event = event, + .group = group, + .ret = 0, + }; smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); + __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3235,11 +3365,14 @@ static u64 perf_event_read(struct perf_event *event) update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (group) + update_group_times(event); + else + update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return perf_event_count(event); + return ret; } /* @@ -3454,6 +3587,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -3677,7 +3814,7 @@ static void put_event(struct perf_event *event) * see the comment there. * * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while + * perf_read_group(), which takes faults while * holding ctx->mutex, however this is called after * the last filedesc died, so there is no possibility * to trigger the AB-BA case. 
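/*
 * Illustrative sketch, not part of this patch: a caller of the new
 * perf_event_read_local(), which is safe even from NMI/tracing context
 * provided the constraints documented above hold (a per-task event
 * targeting current or a per-CPU event for this CPU, no inherit, no
 * pmu::count method).  The event argument is assumed to have been
 * created elsewhere under those constraints.
 */
#include <linux/perf_event.h>

static u64 example_snapshot(struct perf_event *local_counter)
{
	return perf_event_read_local(local_counter);
}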
@@ -3751,14 +3888,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) *running = 0; mutex_lock(&event->child_mutex); - total += perf_event_read(event); + + (void)perf_event_read(event, false); + total += perf_event_count(event); + *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); *running += event->total_time_running + atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); + (void)perf_event_read(child, false); + total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -3768,55 +3909,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) } EXPORT_SYMBOL_GPL(perf_event_read_value); -static int perf_event_read_group(struct perf_event *event, - u64 read_format, char __user *buf) +static int __perf_read_group_add(struct perf_event *leader, + u64 read_format, u64 *values) { - struct perf_event *leader = event->group_leader, *sub; - struct perf_event_context *ctx = leader->ctx; - int n = 0, size = 0, ret; - u64 count, enabled, running; - u64 values[5]; + struct perf_event *sub; + int n = 1; /* skip @nr */ + int ret; - lockdep_assert_held(&ctx->mutex); + ret = perf_event_read(leader, true); + if (ret) + return ret; - count = perf_event_read_value(leader, &enabled, &running); + /* + * Since we co-schedule groups, {enabled,running} times of siblings + * will be identical to those of the leader, so we only publish one + * set. + */ + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] += leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] += leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + /* + * Write {count,id} tuples for every sibling. + */ + values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - size = n * sizeof(u64); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + values[n++] += perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + } - if (copy_to_user(buf, values, size)) - return -EFAULT; + return 0; +} - ret = size; +static int perf_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *child; + struct perf_event_context *ctx = leader->ctx; + int ret; + u64 *values; - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; + lockdep_assert_held(&ctx->mutex); - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); + values = kzalloc(event->read_size, GFP_KERNEL); + if (!values) + return -ENOMEM; - size = n * sizeof(u64); + values[0] = 1 + leader->nr_siblings; - if (copy_to_user(buf + ret, values, size)) { - return -EFAULT; - } + /* + * By locking the child_mutex of the leader we effectively + * lock the child list of all siblings.. XXX explain how. 
+ */ + mutex_lock(&leader->child_mutex); - ret += size; + ret = __perf_read_group_add(leader, read_format, values); + if (ret) + goto unlock; + + list_for_each_entry(child, &leader->child_list, child_list) { + ret = __perf_read_group_add(child, read_format, values); + if (ret) + goto unlock; } + mutex_unlock(&leader->child_mutex); + + ret = event->read_size; + if (copy_to_user(buf, values, event->read_size)) + ret = -EFAULT; + goto out; + +unlock: + mutex_unlock(&leader->child_mutex); +out: + kfree(values); return ret; } -static int perf_event_read_one(struct perf_event *event, +static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; @@ -3854,7 +4035,7 @@ static bool is_event_hup(struct perf_event *event) * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +__perf_read(struct perf_event *event, char __user *buf, size_t count) { u64 read_format = event->attr.read_format; int ret; @@ -3872,9 +4053,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, buf); + ret = perf_read_group(event, read_format, buf); else - ret = perf_event_read_one(event, read_format, buf); + ret = perf_read_one(event, read_format, buf); return ret; } @@ -3887,7 +4068,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ctx = perf_event_ctx_lock(event); - ret = perf_read_hw(event, buf, count); + ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); return ret; @@ -3918,7 +4099,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void _perf_event_reset(struct perf_event *event) { - (void)perf_event_read(event); + (void)perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -3958,28 +4139,21 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0, active; +struct period_event { + struct perf_event *event; u64 value; +}; - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; +static int __perf_event_period(void *info) +{ + struct period_event *pe = info; + struct perf_event *event = pe->event; + struct perf_event_context *ctx = event->ctx; + u64 value = pe->value; + bool active; - raw_spin_lock_irq(&ctx->lock); + raw_spin_lock(&ctx->lock); if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - event->attr.sample_freq = value; } else { event->attr.sample_period = value; @@ -3998,11 +4172,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } + raw_spin_unlock(&ctx->lock); -unlock: + return 0; +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct period_event pe = { .event = event, }; + struct perf_event_context *ctx = event->ctx; + struct task_struct *task; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + if 
(event->attr.freq && value > sysctl_perf_event_sample_rate) + return -EINVAL; + + task = ctx->task; + pe.value = value; + + if (!task) { + cpu_function_call(event->cpu, __perf_event_period, &pe); + return 0; + } + +retry: + if (!task_function_call(task, __perf_event_period, &pe)) + return 0; + + raw_spin_lock_irq(&ctx->lock); + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + task = ctx->task; + goto retry; + } + + __perf_event_period(&pe); raw_spin_unlock_irq(&ctx->lock); - return ret; + return 0; } static const struct file_operations perf_fops; @@ -4740,12 +4956,20 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ +static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) +{ + /* only the parent has fasync state */ + if (event->parent) + event = event->parent; + return &event->fasync; +} + void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); + kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } @@ -5151,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_RAW) { if (data->raw) { - perf_output_put(handle, data->raw->size); - __output_copy(handle, data->raw->data, - data->raw->size); + u32 raw_size = data->raw->size; + u32 real_size = round_up(raw_size + sizeof(u32), + sizeof(u64)) - sizeof(u32); + u64 zero = 0; + + perf_output_put(handle, real_size); + __output_copy(handle, data->raw->data, raw_size); + if (real_size - raw_size) + __output_copy(handle, &zero, real_size - raw_size); } else { struct { u32 size; @@ -5285,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header, else size += sizeof(u32); - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; + header->size += round_up(size, sizeof(u64)); } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -5982,6 +6211,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) } /* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_switch(struct 
task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + +/* * IRQ throttle logging */ @@ -6040,8 +6354,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); @@ -6124,7 +6436,7 @@ static int __perf_event_overflow(struct perf_event *event, else perf_event_output(event, data, regs); - if (event->fasync && event->pending_kill) { + if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending); } @@ -6749,8 +7061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); @@ -7074,24 +7386,49 @@ static void perf_pmu_nop_void(struct pmu *pmu) { } +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) +{ +} + static int perf_pmu_nop_int(struct pmu *pmu) { return 0; } -static void perf_pmu_start_txn(struct pmu *pmu) +static DEFINE_PER_CPU(unsigned int, nop_txn_flags); + +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) { + __this_cpu_write(nop_txn_flags, flags); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); } static int perf_pmu_commit_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return 0; + perf_pmu_enable(pmu); return 0; } static void perf_pmu_cancel_txn(struct pmu *pmu) { + unsigned int flags = __this_cpu_read(nop_txn_flags); + + __this_cpu_write(nop_txn_flags, 0); + + if (flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_enable(pmu); } @@ -7330,7 +7667,7 @@ got_cpu_context: pmu->commit_txn = perf_pmu_commit_txn; pmu->cancel_txn = perf_pmu_cancel_txn; } else { - pmu->start_txn = perf_pmu_nop_void; + pmu->start_txn = perf_pmu_nop_txn; pmu->commit_txn = perf_pmu_nop_int; pmu->cancel_txn = perf_pmu_nop_void; } @@ -7418,7 +7755,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) return ret; } -struct pmu *perf_init_event(struct perf_event *event) +static struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; @@ -7479,6 +7816,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) @@ -8100,13 +8441,35 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; + mutex_lock_double(&gctx->mutex, &ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + } + + if 
(!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_locked; + } + + /* + * Must be under the same ctx::mutex as perf_install_in_context(), + * because we need to serialize with concurrent event creation. + */ + if (!exclusive_event_installable(event, ctx)) { + /* exclusive and group stuff are assumed mutually exclusive */ + WARN_ON_ONCE(move_group); + + err = -EBUSY; + goto err_locked; + } + WARN_ON_ONCE(ctx->parent_ctx); + + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - mutex_lock_double(&gctx->mutex, &ctx->mutex); - perf_remove_from_context(group_leader, false); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -8114,13 +8477,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_remove_from_context(sibling, false); put_ctx(gctx); } - } else { - mutex_lock(&ctx->mutex); - } - WARN_ON_ONCE(ctx->parent_ctx); - - if (move_group) { /* * Wait for everybody to stop referencing the events through * the old lists, before installing it on new lists. @@ -8152,22 +8509,29 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - } - if (!exclusive_event_installable(event, ctx)) { - err = -EBUSY; - mutex_unlock(&ctx->mutex); - fput(event_file); - goto err_context; + /* + * Now that all events are installed in @ctx, nothing + * references @gctx anymore, so drop the last reference we have + * on it. + */ + put_ctx(gctx); } + /* + * Precalculate sample_data sizes; do while holding ctx::mutex such + * that we're serialized against further additions and before + * perf_install_in_context() which is the point the event is active and + * can use these values. + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) { + if (move_group) mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -8179,12 +8543,6 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* * Drop the reference on the group_event after placing the * new event on the sibling_list. 
This ensures destruction * of the group leader will find the pointer to itself in @@ -8194,6 +8552,12 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; +err_locked: + if (move_group) + mutex_unlock(&gctx->mutex); + mutex_unlock(&ctx->mutex); +/* err_file: */ + fput(event_file); err_context: perf_unpin_context(ctx); put_ctx(ctx); @@ -8574,6 +8938,31 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } +struct perf_event *perf_event_get(unsigned int fd) +{ + int err; + struct fd f; + struct perf_event *event; + + err = perf_fget_light(fd, &f); + if (err) + return ERR_PTR(err); + + event = f.file->private_data; + atomic_long_inc(&event->refcount); + fdput(f); + + return event; +} + +const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + if (!event) + return ERR_PTR(-EINVAL); + + return &event->attr; +} + /* * inherit a event from parent task to child task: */ @@ -8872,7 +9261,7 @@ static void perf_event_init_cpu(int cpu) mutex_unlock(&swhash->hlist_mutex); } -#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; @@ -9071,25 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - task_function_call(task, __perf_cgroup_move, task); -} - struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, - .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b2be01b1aa9d..b5d1ea79c595 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle, perf_output_get_handle(handle); do { - tail = READ_ONCE_CTRL(rb->user_page->data_tail); + tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) @@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. 
*/ split_page(page, order); SetPagePrivate(page); @@ -559,11 +562,13 @@ static void __rb_free_aux(struct ring_buffer *rb) rb->aux_priv = NULL; } - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } } void rb_free_aux(struct ring_buffer *rb) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f26a22d..4e5e9798aa0c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -105,17 +96,18 @@ struct return_instance { * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, 
rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; @@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); @@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. 
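A visible side effect of switching the XOL area to a named vm_special_mapping above is that the slot page now appears as "[uprobes]" in /proc/<pid>/maps of any process that has executed out-of-line at least once. A trivial sketch a traced program could use to confirm that (hypothetical example, not part of the patch; it prints nothing unless a uprobe has actually fired in the process):

/* Hypothetical user-space example, not part of this patch. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/maps", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "[uprobes]"))
                        fputs(line, stdout);    /* the XOL slot page */
        fclose(f);
        return 0;
}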
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. 
*/ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } @@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static struct return_instance *find_next_ret_chain(struct return_instance *ri) { - struct uprobe_task *utask; - struct return_instance *ri, *tmp; bool chained; + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + +static void handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *next; + bool valid; + utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; - - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { - handle_uretprobe_chain(ri, regs); - - chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); - utask->depth--; + goto sigill; - if (!chained) - break; - BUG_ON(!ri); - } + do { + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 
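The cleanup_return_instances()/arch_uretprobe_is_alive() changes above exist to cope with user code that longjmp()s across probed returns. A minimal user-space illustration of that pattern follows (hypothetical test case, not part of the patch): if a uretprobe were placed on leaf() or middle(), the longjmp() skips both of their returns, and the corresponding return_instances must be discarded rather than replayed.

/* Hypothetical user-space test case, not part of this patch. */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

static void leaf(void)
{
        /* Skips the returns of leaf() and middle() entirely, so any
         * return_instances recorded for them become stale. */
        longjmp(env, 1);
}

static void middle(void)
{
        leaf();
}

int main(void)
{
        if (setjmp(env) == 0)
                middle();
        else
                printf("returned via longjmp\n");
        return 0;
}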
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1b5ef7..07110c6020a0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -706,10 +706,12 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) + if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } /* sync mm's RSS info before statistics gathering */ if (tsk->mm) @@ -761,7 +763,9 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(preempt_disable()); TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); + TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA diff --git a/kernel/extable.c b/kernel/extable.c index c98f926277a8..e820ccee9846 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -18,7 +18,6 @@ #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/module.h> -#include <linux/ftrace.h> #include <linux/mutex.h> #include <linux/init.h> diff --git a/kernel/fork.c b/kernel/fork.c index dbd9b8d7b7cc..f97f2c449f5c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + cgroup_free(tsk); task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); @@ -454,8 +455,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= + ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; + tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -1072,6 +1075,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; + atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; @@ -1099,7 +1103,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); - sig->cputimer.running = 1; + sig->cputimer.running = true; } /* The timer lists. 
*/ @@ -1133,6 +1137,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); seqlock_init(&sig->stats_lock); + prev_cputime_init(&sig->prev_cputime); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; @@ -1244,6 +1249,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; + void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1278,10 +1284,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* * If the new process will be in a different pid or user namespace - * do not allow it to share a thread group or signal handlers or - * parent with the forking task. + * do not allow it to share a thread group with the forking task. */ - if (clone_flags & CLONE_SIGHAND) { + if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) @@ -1340,9 +1345,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - p->prev_cputime.utime = p->prev_cputime.stime = 0; -#endif + prev_cputime_init(&p->prev_cputime); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_init(&p->vtime_seqlock); p->vtime_snap = 0; @@ -1518,6 +1522,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->task_works = NULL; /* + * Ensure that the cgroup subsystem policies allow the new process to be + * forked. It should be noted the the new process's css_set can be changed + * between here and cgroup_post_fork() if an organisation operation is in + * progress. + */ + retval = cgroup_can_fork(p, cgrp_ss_priv); + if (retval) + goto bad_fork_free_pid; + + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ @@ -1553,7 +1567,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_free_pid; + goto bad_fork_cancel_cgroup; } if (likely(p->pid)) { @@ -1595,7 +1609,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); + cgroup_post_fork(p, cgrp_ss_priv); if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); @@ -1605,6 +1619,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; +bad_fork_cancel_cgroup: + cgroup_cancel_fork(p, cgrp_ss_priv); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1871,13 +1887,21 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* - * Not implemented, but pretend it works if there is nothing to - * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND - * needs to unshare vm. + * Not implemented, but pretend it works if there is nothing + * to unshare. Note that unsharing the address space or the + * signal handlers also need to unshare the signal queues (aka + * CLONE_THREAD). 
*/ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { - /* FIXME: get_task_mm() increments ->mm_users */ - if (atomic_read(¤t->mm->mm_users) > 1) + if (!thread_group_empty(current)) + return -EINVAL; + } + if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { + if (atomic_read(¤t->sighand->count) > 1) + return -EINVAL; + } + if (unshare_flags & CLONE_VM) { + if (!current_is_single_threaded()) return -EINVAL; } @@ -1941,21 +1965,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) int err; /* - * If unsharing a user namespace must also unshare the thread. + * If unsharing a user namespace must also unshare the thread group + * and unshare the filesystem root and working directories. */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* - * If unsharing a thread from a thread group, must also unshare vm. - */ - if (unshare_flags & CLONE_THREAD) - unshare_flags |= CLONE_VM; - /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* + * If unsharing a signal handlers, must also unshare the signal queues. + */ + if (unshare_flags & CLONE_SIGHAND) + unshare_flags |= CLONE_THREAD; + /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) diff --git a/kernel/futex.c b/kernel/futex.c index c4a182f5357e..684d7549825a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -64,6 +64,7 @@ #include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/bootmem.h> +#include <linux/fault-inject.h> #include <asm/futex.h> @@ -254,9 +255,78 @@ struct futex_hash_bucket { struct plist_head chain; } ____cacheline_aligned_in_smp; -static unsigned long __read_mostly futex_hashsize; +/* + * The base of the bucket array and its size are always used together + * (after initialization only in hash_futex()), so ensure that they + * reside in the same cacheline. + */ +static struct { + struct futex_hash_bucket *queues; + unsigned long hashsize; +} __futex_data __read_mostly __aligned(2*sizeof(long)); +#define futex_queues (__futex_data.queues) +#define futex_hashsize (__futex_data.hashsize) + + +/* + * Fault injections for futexes. 
+ */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; -static struct futex_hash_bucket *futex_queues; + bool ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = false, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +static bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if (!debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else +static inline bool should_fail_futex(bool fshared) +{ + return false; +} +#endif /* CONFIG_FAIL_FUTEX */ static inline void futex_get_mm(union futex_key *key) { @@ -413,6 +483,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -428,6 +501,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + err = get_user_pages_fast(address, 1, 1, &page); /* * If write access is not required (eg. FUTEX_WAIT), try @@ -516,7 +593,7 @@ again: * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ - if (ro) { + if (unlikely(should_fail_futex(fshared)) || ro) { err = -EFAULT; goto out; } @@ -974,6 +1051,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { u32 uninitialized_var(curval); + if (unlikely(should_fail_futex(true))) + return -EFAULT; + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; @@ -1015,12 +1095,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, if (get_futex_value_locked(&uval, uaddr)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Detect deadlocks. */ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + /* * Lookup existing state first. If it exists, try to attach to * its pi_state. @@ -1155,6 +1241,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + if (unlikely(should_fail_futex(true))) + ret = -EFAULT; + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) @@ -1457,6 +1546,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + if (unlikely(should_fail_futex(true))) + return -EFAULT; + /* * Find the top_waiter and determine if there are additional waiters. 
* If the caller intends to requeue more than 1 waiter to pifutex, @@ -2268,8 +2360,11 @@ static long futex_wait_restart(struct restart_block *restart) /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block, it does PI, etc. (Due to - * races the kernel might see a 0 value of the futex too.) + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. */ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) @@ -2300,6 +2395,10 @@ retry_private: ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ switch (ret) { case 1: /* We got the lock. */ @@ -2530,7 +2629,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. + * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all @@ -3005,6 +3104,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9a76e3beda54..3b48dab80164 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ config GENERIC_PENDING_IRQ bool +# Support for generic irq migrating off cpu before the cpu is offline. +config GENERIC_IRQ_MIGRATION + bool + # Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index d12123526e2b..2fc9cbdf35b6 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o +obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 27f4332c7f84..15206453b12a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -21,6 +21,20 @@ #include "internals.h" +static irqreturn_t bad_chained_irq(int irq, void *dev_id) +{ + WARN_ONCE(1, "Chained irq %d should not call an action\n", irq); + return IRQ_NONE; +} + +/* + * Chained handlers should never call action on their IRQ. This default + * action will emit warning if such thing happens. 
+ */ +struct irqaction chained_action = { + .handler = bad_chained_irq, +}; + /** * irq_set_chip - set the irq chip for an irq * @irq: irq number @@ -63,7 +77,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; } @@ -83,7 +97,7 @@ int irq_set_handler_data(unsigned int irq, void *data) if (!desc) return -EINVAL; - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } @@ -105,7 +119,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, if (!desc) return -EINVAL; - desc->irq_data.msi_desc = entry; + desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); @@ -187,7 +201,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_enable(desc); } if (resend) - check_irq_resend(desc, desc->irq_data.irq); + check_irq_resend(desc); return ret; } @@ -227,6 +241,13 @@ void irq_enable(struct irq_desc *desc) * disabled. If an interrupt happens, then the interrupt flow * handler masks the line at the hardware level and marks it * pending. + * + * If the interrupt chip does not implement the irq_disable callback, + * a driver can disable the lazy approach for a particular irq line by + * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can + * be used for devices which cannot disable the interrupt at the + * device level under certain circumstances and have to use + * disable_irq[_nosync] instead. */ void irq_disable(struct irq_desc *desc) { @@ -234,6 +255,8 @@ void irq_disable(struct irq_desc *desc) if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); irq_state_set_masked(desc); + } else if (irq_settings_disable_unlazy(desc)) { + mask_irq(desc); } } @@ -315,7 +338,7 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -328,7 +351,7 @@ void handle_nested_irq(unsigned int irq) action_ret = action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + note_interrupt(desc, action_ret); raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); @@ -372,7 +395,6 @@ static bool irq_may_run(struct irq_desc *desc) /** * handle_simple_irq - Simple and software-decoded IRQs. - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt @@ -382,8 +404,7 @@ static bool irq_may_run(struct irq_desc *desc) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. 
*/ -void -handle_simple_irq(unsigned int irq, struct irq_desc *desc) +void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -391,7 +412,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; @@ -425,7 +446,6 @@ static void cond_unmask_irq(struct irq_desc *desc) /** * handle_level_irq - Level type irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has @@ -433,8 +453,7 @@ static void cond_unmask_irq(struct irq_desc *desc) * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void -handle_level_irq(unsigned int irq, struct irq_desc *desc) +void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -443,7 +462,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -496,7 +515,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) /** * handle_fasteoi_irq - irq handler for transparent controllers - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() @@ -504,8 +522,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; @@ -515,7 +532,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -546,7 +563,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware @@ -560,8 +576,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq); * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void -handle_edge_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); @@ -583,7 +598,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; } - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); @@ -618,13 +633,12 @@ EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. 
*/ -void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); @@ -646,7 +660,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) goto out_eoi; } - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); do { if (unlikely(!desc->action)) @@ -665,22 +679,20 @@ out_eoi: /** * handle_percpu_irq - Per CPU local irq handler - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ -void -handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - handle_irq_event_percpu(desc, desc->action); + handle_irq_event_percpu(desc); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); @@ -688,7 +700,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids - * @irq: the interrupt number * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. Same as @@ -698,14 +709,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) * contain the real device id for the cpu on which this handler is * called */ -void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; void *dev_id = raw_cpu_ptr(action->percpu_dev_id); + unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -757,6 +769,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); + if (is_chained) + desc->action = NULL; desc->depth = 1; } desc->handle_irq = handle; @@ -766,6 +780,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); + desc->action = &chained_action; irq_startup(desc, true); } } @@ -796,7 +811,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, return; __irq_do_set_handler(desc, handle, 1, NULL); - desc->irq_data.handler_data = data; + desc->irq_common_data.handler_data = data; irq_put_desc_busunlock(desc, flags); } @@ -985,6 +1000,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data, } /** + * irq_chip_set_type_parent - Set IRQ type on the parent interrupt + * @data: Pointer to interrupt specific data + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * + * Conditional, as the underlying parent chip might not implement it. 
+ */ +int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) +{ + data = data->parent_data; + + if (data->chip->irq_set_type) + return data->chip->irq_set_type(data, type); + + return -ENOSYS; +} + +/** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware * @data: Pointer to interrupt specific data * @@ -997,13 +1029,13 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) if (data->chip && data->chip->irq_retrigger) return data->chip->irq_retrigger(data); - return -ENOSYS; + return 0; } /** * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt * @data: Pointer to interrupt specific data - * @dest: The vcpu affinity information + * @vcpu_info: The vcpu affinity information */ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) { diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c new file mode 100644 index 000000000000..011f8c4c63da --- /dev/null +++ b/kernel/irq/cpuhotplug.c @@ -0,0 +1,82 @@ +/* + * Generic cpu hotunplug interrupt migration code copied from the + * arch/arm implementation + * + * Copyright (C) Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/interrupt.h> +#include <linux/ratelimit.h> +#include <linux/irq.h> + +#include "internals.h" + +static bool migrate_one_irq(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + const struct cpumask *affinity = d->common->affinity; + struct irq_chip *c; + bool ret = false; + + /* + * If this is a per-CPU interrupt, or the affinity does not + * include this CPU, then we have nothing to do. + */ + if (irqd_is_per_cpu(d) || + !cpumask_test_cpu(smp_processor_id(), affinity)) + return false; + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + affinity = cpu_online_mask; + ret = true; + } + + c = irq_data_get_irq_chip(d); + if (!c->irq_set_affinity) { + pr_debug("IRQ%u: unable to set affinity\n", d->irq); + } else { + int r = irq_do_set_affinity(d, affinity, false); + if (r) + pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", + d->irq, r); + } + + return ret; +} + +/** + * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu + * + * The current CPU has been marked offline. Migrate IRQs off this CPU. + * If the affinity settings do not allow other CPUs, force them onto any + * available CPU. + * + * Note: we must iterate over all IRQs, whether they have an attached + * action structure or not, as we need to get chained interrupts too. 
+ */ +void irq_migrate_all_off_this_cpu(void) +{ + unsigned int irq; + struct irq_desc *desc; + unsigned long flags; + + local_irq_save(flags); + + for_each_active_irq(irq) { + bool affinity_broken; + + desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); + affinity_broken = migrate_one_irq(desc); + raw_spin_unlock(&desc->lock); + + if (affinity_broken) + pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", + irq, smp_processor_id()); + } + + local_irq_restore(flags); +} diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 15b370daf234..abd286afbd27 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -553,6 +553,9 @@ static int irq_gc_suspend(void) if (data) ct->chip.irq_suspend(data); } + + if (gc->suspend) + gc->suspend(gc); } return 0; } @@ -564,6 +567,9 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; + if (gc->resume) + gc->resume(gc); + if (ct->chip.irq_resume) { struct irq_data *data = irq_gc_get_irq_data(gc); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 635480270858..a302cf9a2126 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,17 +22,19 @@ /** * handle_bad_irq - handle spurious and unhandled irqs - * @irq: the interrupt number * @desc: description of the interrupt * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(struct irq_desc *desc) { + unsigned int irq = irq_desc_get_irq(desc); + print_irq_desc(irq, desc); - kstat_incr_irqs_this_cpu(irq, desc); + kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); } +EXPORT_SYMBOL_GPL(handle_bad_irq); /* * Special, empty irq handler: @@ -130,11 +132,11 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; + struct irqaction *action = desc->action; do { irqreturn_t res; @@ -176,20 +178,19 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) add_interrupt_randomness(irq, flags); if (!noirqdebug) - note_interrupt(irq, desc, retval); + note_interrupt(desc, retval); return retval; } irqreturn_t handle_irq_event(struct irq_desc *desc) { - struct irqaction *action = desc->action; irqreturn_t ret; desc->istate &= ~IRQS_PENDING; irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - ret = handle_irq_event_percpu(desc, action); + ret = handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 61008b8433ab..05c2188271b8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,6 +18,8 @@ extern bool noirqdebug; +extern struct irqaction chained_action; + /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run @@ -59,10 +61,9 @@ enum { #include "debug.h" #include "settings.h" -extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags); -extern void __disable_irq(struct irq_desc *desc, unsigned int irq); -extern void __enable_irq(struct irq_desc *desc, unsigned int irq); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); +extern void 
__disable_irq(struct irq_desc *desc); +extern void __enable_irq(struct irq_desc *desc); extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); @@ -82,11 +83,11 @@ extern void irq_mark_irq(unsigned int irq); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ -void check_irq_resend(struct irq_desc *desc, unsigned int irq); +void check_irq_resend(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); @@ -187,7 +188,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } -static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); @@ -195,7 +196,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d static inline int irq_desc_get_node(struct irq_desc *desc) { - return irq_data_get_node(&desc->irq_data); + return irq_common_data_get_node(&desc->irq_common_data); } #ifdef CONFIG_PM_SLEEP diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4afc457613dd..239e2ae2c947 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void) #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { - if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, + gfp, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif @@ -52,11 +53,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) static void desc_smp_init(struct irq_desc *desc, int node) { - desc->irq_data.node = node; - cpumask_copy(desc->irq_data.affinity, irq_default_affinity); + cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif +#ifdef CONFIG_NUMA + desc->irq_common_data.node = node; +#endif } #else @@ -70,12 +73,13 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, { int cpu; + desc->irq_common_data.handler_data = NULL; + desc->irq_common_data.msi_desc = NULL; + desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; - desc->irq_data.handler_data = NULL; - desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; @@ -121,7 +125,7 @@ static void free_masks(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif - free_cpumask_var(desc->irq_data.affinity); + free_cpumask_var(desc->irq_common_data.affinity); } #else static inline void free_masks(struct irq_desc *desc) { } @@ -343,7 +347,7 @@ int generic_handle_irq(unsigned int irq) if (!desc) return -EINVAL; - 
generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return 0; } EXPORT_SYMBOL_GPL(generic_handle_irq); @@ -582,7 +586,7 @@ int irq_set_percpu_devid(unsigned int irq) void kstat_incr_irq_this_cpu(unsigned int irq) { - kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c3577fef78c..22aa9612ef7c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); +struct irqchip_fwid { + struct fwnode_handle fwnode; + char *name; + void *data; +}; + +/** + * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for + * identifying an irq domain + * @data: optional user-provided data + * + * Allocate a struct device_node, and return a poiner to the embedded + * fwnode_handle (or NULL on failure). + */ +struct fwnode_handle *irq_domain_alloc_fwnode(void *data) +{ + struct irqchip_fwid *fwid; + char *name; + + fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "irqchip@%p", data); + + if (!fwid || !name) { + kfree(fwid); + kfree(name); + return NULL; + } + + fwid->name = name; + fwid->data = data; + fwid->fwnode.type = FWNODE_IRQCHIP; + return &fwid->fwnode; +} + +/** + * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle + * + * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. + */ +void irq_domain_free_fwnode(struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid; + + if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + return; + + fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + kfree(fwid->name); + kfree(fwid); +} + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -40,23 +91,28 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain); * Allocates and initialize and irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. 
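 *
 * For illustration only (placeholder names, not part of this patch): with
 * the fwnode-based signature below, a controller that has no device-tree
 * node can allocate a synthetic fwnode and build a domain from it, e.g.:
 *
 *	fwnode = irq_domain_alloc_fwnode(reg_base);
 *	domain = irq_domain_create_linear(fwnode, nr_irqs,
 *					  &foo_domain_ops, priv);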
*/ -struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; + struct device_node *of_node; + + of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; + of_node_get(of_node); + /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; - domain->of_node = of_node_get(of_node); + domain->fwnode = fwnode; domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; @@ -102,7 +158,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(domain->of_node); + of_node_put(irq_domain_get_of_node(domain)); kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); @@ -133,7 +189,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); + domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); if (!domain) return NULL; @@ -177,7 +233,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, { struct irq_domain *domain; - domain = __irq_domain_add(of_node, first_hwirq + size, + domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size, first_hwirq + size, 0, ops, host_data); if (domain) irq_domain_associate_many(domain, first_irq, first_hwirq, size); @@ -187,10 +243,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_host() - Locates a domain for a given device node - * @node: device-tree node of the interrupt controller + * irq_find_matching_fwnode() - Locates a domain for a given fwnode + * @fwnode: FW descriptor of the interrupt controller + * @bus_token: domain-specific data */ -struct irq_domain *irq_find_host(struct device_node *node) +struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; int rc; @@ -199,13 +257,19 @@ struct irq_domain *irq_find_host(struct device_node *node) * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... + * + * bus_token == DOMAIN_BUS_ANY matches any domain, any other + * values must generate an exact match for the domain to be + * selected. 
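+ *
+ * For illustration only (not part of this patch): a caller that wants
+ * only the wired (non-MSI) domain of a controller would pass an
+ * explicit token, e.g.:
+ *
+ *	d = irq_find_matching_fwnode(fwnode, DOMAIN_BUS_WIRED);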
*/ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->match) - rc = h->ops->match(h, node); + rc = h->ops->match(h, to_of_node(fwnode), bus_token); else - rc = (h->of_node != NULL) && (h->of_node == node); + rc = ((fwnode != NULL) && (h->fwnode == fwnode) && + ((bus_token == DOMAIN_BUS_ANY) || + (h->bus_token == bus_token))); if (rc) { found = h; @@ -215,7 +279,7 @@ struct irq_domain *irq_find_host(struct device_node *node) mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_host); +EXPORT_SYMBOL_GPL(irq_find_matching_fwnode); /** * irq_set_default_host() - Set a "default" irq domain @@ -328,10 +392,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { + struct device_node *of_node; int i; + of_node = irq_domain_get_of_node(domain); pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, - of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + of_node_full_name(of_node), irq_base, (int)hwirq_base, count); for (i = 0; i < count; i++) { irq_domain_associate(domain, irq_base + i, hwirq_base + i); @@ -351,12 +417,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); */ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { + struct device_node *of_node; unsigned int virq; if (domain == NULL) domain = irq_default_domain; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + of_node = irq_domain_get_of_node(domain); + virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; @@ -391,6 +459,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { + struct device_node *of_node; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -404,6 +473,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("-> using domain @%p\n", domain); + of_node = irq_domain_get_of_node(domain); + /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { @@ -412,8 +483,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, - of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -425,7 +495,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, of_node_full_name(domain->of_node), virq); + hwirq, of_node_full_name(of_node), virq); return virq; } @@ -452,10 +522,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count) { + struct device_node *of_node; int ret; + of_node = irq_domain_get_of_node(domain); ret = irq_alloc_descs(irq_base, irq_base, count, - of_node_to_nid(domain->of_node)); + of_node_to_nid(of_node)); if (unlikely(ret < 0)) return ret; @@ -464,28 +536,56 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); -unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +static int irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + irq_hw_number_t *hwirq, 
unsigned int *type) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (d->ops->translate) + return d->ops->translate(d, fwspec, hwirq, type); +#endif + if (d->ops->xlate) + return d->ops->xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); + + /* If domain has no translation, then we assume interrupt line */ + *hwirq = fwspec->param[0]; + return 0; +} + +static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, + struct irq_fwspec *fwspec) +{ + int i; + + fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; + fwspec->param_count = irq_data->args_count; + + for (i = 0; i < irq_data->args_count; i++) + fwspec->param[i] = irq_data->args[i]; +} + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; - domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; + if (fwspec->fwnode) + domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); + else + domain = irq_default_domain; + if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(irq_data->np)); + of_node_full_name(to_of_node(fwspec->fwnode))); return 0; } - /* If domain has no translation, then we assume interrupt line */ - if (domain->ops->xlate == NULL) - hwirq = irq_data->args[0]; - else { - if (domain->ops->xlate(domain, irq_data->np, irq_data->args, - irq_data->args_count, &hwirq, &type)) - return 0; - } + if (irq_domain_translate(domain, fwspec, &hwirq, &type)) + return 0; if (irq_domain_is_hierarchy(domain)) { /* @@ -496,7 +596,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) if (virq) return virq; - virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; } else { @@ -512,6 +612,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) irq_set_irq_type(virq, type); return virq; } +EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); + +unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(irq_data, &fwspec); + return irq_create_fwspec_mapping(&fwspec); +} EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** @@ -582,14 +691,16 @@ static int virq_debug_show(struct seq_file *m, void *private) "name", "mapped", "linear-max", "direct-max", "devtree-node"); mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { + struct device_node *of_node; int count = 0; + of_node = irq_domain_get_of_node(domain); radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - domain->of_node ? of_node_full_name(domain->of_node) : ""); + of_node ? of_node_full_name(of_node) : ""); } mutex_unlock(&irq_domain_mutex); @@ -743,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** - * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy * @parent: Parent irq domain to associate with the new domain * @flags: Irq domain flags associated to the domain * @size: Size of the domain. 
See below - * @node: Optional device-tree node of the interrupt controller + * @fwnode: Optional fwnode of the interrupt controller * @ops: Pointer to the interrupt domain callbacks * @host_data: Controller private data pointer * @@ -757,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, * domain flags are set. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, unsigned int flags, unsigned int size, - struct device_node *node, + struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; if (size) - domain = irq_domain_add_linear(node, size, ops, host_data); + domain = irq_domain_create_linear(fwnode, size, ops, host_data); else - domain = irq_domain_add_tree(node, ops, host_data); + domain = irq_domain_create_tree(fwnode, ops, host_data); if (domain) { domain->parent = parent; domain->flags |= flags; @@ -836,7 +947,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, child->parent_data = irq_data; irq_data->irq = child->irq; irq_data->common = child->common; - irq_data->node = child->node; irq_data->domain = domain; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9744853b656..0eebaeef317b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; +static int __irq_can_set_affinity(struct irq_desc *desc) +{ + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + return 0; + return 1; +} + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity; */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !irqd_can_balance(&desc->irq_data) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - - return 1; + return __irq_can_set_affinity(irq_to_desc(irq)); } /** @@ -190,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: - cpumask_copy(data->affinity, mask); + cpumask_copy(desc->irq_common_data.affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; @@ -256,37 +258,6 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); -/** - * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt - * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data - * - * This function uses the vCPU specific data to set the vCPU - * affinity for an irq. The vCPU specific data is passed from - * outside, such as KVM. One example code path is as below: - * KVM -> IOMMU -> irq_set_vcpu_affinity(). 
- */ -int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - struct irq_data *data; - struct irq_chip *chip; - int ret = -ENOSYS; - - if (!desc) - return -EINVAL; - - data = irq_desc_get_irq_data(desc); - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) - ret = chip->irq_set_vcpu_affinity(data, vcpu_info); - irq_put_desc_unlock(desc, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); - static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = @@ -302,7 +273,7 @@ static void irq_affinity_notify(struct work_struct *work) if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else - cpumask_copy(cpumask, desc->irq_data.affinity); + cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); @@ -359,14 +330,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) { struct cpumask *set = irq_default_affinity; int node = irq_desc_get_node(desc); /* Excludes PER_CPU and NO_BALANCE interrupts */ - if (!irq_can_set_affinity(irq)) + if (!__irq_can_set_affinity(desc)) return 0; /* @@ -374,9 +344,9 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) * one of the targets is online. */ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { - if (cpumask_intersects(desc->irq_data.affinity, + if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) - set = desc->irq_data.affinity; + set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } @@ -393,10 +363,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) return 0; } #else -static inline int -setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) +/* Wrapper for ALPHA specific affinity selector magic */ +static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) { - return irq_select_affinity(irq); + return irq_select_affinity(irq_desc_get_irq(d)); } #endif @@ -410,20 +380,51 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc, mask); + ret = setup_affinity(desc, mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } #else static inline int -setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +setup_affinity(struct irq_desc *desc, struct cpumask *mask) { return 0; } #endif -void __disable_irq(struct irq_desc *desc, unsigned int irq) +/** + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data + * + * This function uses the vCPU specific data to set the vCPU + * affinity for an irq. The vCPU specific data is passed from + * outside, such as KVM. One example code path is as below: + * KVM -> IOMMU -> irq_set_vcpu_affinity(). 
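+ *
+ * For illustration only (not part of this patch): an irqchip that merely
+ * forwards this to its parent in a hierarchical domain can point its
+ * callback at irq_chip_set_vcpu_affinity_parent (kernel/irq/chip.c):
+ *
+ *	.irq_set_vcpu_affinity	= irq_chip_set_vcpu_affinity_parent,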
+ */ +int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + struct irq_data *data; + struct irq_chip *chip; + int ret = -ENOSYS; + + if (!desc) + return -EINVAL; + + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + ret = chip->irq_set_vcpu_affinity(data, vcpu_info); + irq_put_desc_unlock(desc, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); + +void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); @@ -436,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq) if (!desc) return -EINVAL; - __disable_irq(desc, irq); + __disable_irq(desc); irq_put_desc_busunlock(desc, flags); return 0; } @@ -503,12 +504,13 @@ bool disable_hardirq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_hardirq); -void __enable_irq(struct irq_desc *desc, unsigned int irq) +void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: - WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", + irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) @@ -516,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq) /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); irq_enable(desc); - check_irq_resend(desc, irq); + check_irq_resend(desc); /* fall-through */ } default: @@ -546,7 +548,7 @@ void enable_irq(unsigned int irq) KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; - __enable_irq(desc, irq); + __enable_irq(desc); out: irq_put_desc_busunlock(desc, flags); } @@ -637,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return canrequest; } -int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) +int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; @@ -648,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", + irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -685,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); + flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); @@ -728,6 +730,12 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) return IRQ_NONE; } +static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) +{ + WARN(1, "Secondary action handler called for irq %d\n", irq); + return IRQ_NONE; +} + static int irq_wait_for_interrupt(struct irqaction *action) { set_current_state(TASK_INTERRUPTIBLE); @@ -754,7 +762,8 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { - if (!(desc->istate & IRQS_ONESHOT)) + if (!(desc->istate & IRQS_ONESHOT) || + action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); @@ -827,8 +836,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. 
Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (desc->irq_data.affinity) - cpumask_copy(mask, desc->irq_data.affinity); + if (desc->irq_common_data.affinity) + cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; raw_spin_unlock_irq(&desc->lock); @@ -908,6 +917,18 @@ static void irq_thread_dtor(struct callback_head *unused) irq_finalize_oneshot(desc, action); } +static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) +{ + struct irqaction *secondary = action->secondary; + + if (WARN_ON_ONCE(!secondary)) + return; + + raw_spin_lock_irq(&desc->lock); + __irq_wake_thread(desc, secondary); + raw_spin_unlock_irq(&desc->lock); +} + /* * Interrupt handler thread */ @@ -938,6 +959,8 @@ static int irq_thread(void *data) action_ret = handler_fn(desc, action); if (action_ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); + if (action_ret == IRQ_WAKE_THREAD) + irq_wake_secondary(desc, action); wake_threads_waitq(desc); } @@ -982,20 +1005,36 @@ void irq_wake_thread(unsigned int irq, void *dev_id) } EXPORT_SYMBOL_GPL(irq_wake_thread); -static void irq_setup_forced_threading(struct irqaction *new) +static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) - return; + return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) - return; + return 0; new->flags |= IRQF_ONESHOT; - if (!new->thread_fn) { - set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); - new->thread_fn = new->handler; - new->handler = irq_default_primary_handler; + /* + * Handle the case where we have a real primary handler and a + * thread handler. We force thread them as well by creating a + * secondary action. + */ + if (new->handler != irq_default_primary_handler && new->thread_fn) { + /* Allocate the secondary action */ + new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!new->secondary) + return -ENOMEM; + new->secondary->handler = irq_forced_secondary_handler; + new->secondary->thread_fn = new->thread_fn; + new->secondary->dev_id = new->dev_id; + new->secondary->irq = new->irq; + new->secondary->name = new->name; } + /* Deal with the primary handler */ + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + return 0; } static int irq_request_resources(struct irq_desc *desc) @@ -1015,6 +1054,48 @@ static void irq_release_resources(struct irq_desc *desc) c->irq_release_resources(d); } +static int +setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) +{ + struct task_struct *t; + struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; + + if (!secondary) { + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + } else { + t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, + new->name); + param.sched_priority -= 1; + } + + if (IS_ERR(t)) + return PTR_ERR(t); + + sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + /* + * Tell the thread to set its affinity. This is + * important for shared interrupt handlers as we do + * not invoke setup_affinity() for the secondary + * handlers as everything is already set up. 
Even for + * interrupts marked with IRQF_NO_BALANCE this is + * correct as we want the thread to move to the cpu(s) + * on which the requesting code placed the interrupt. + */ + set_bit(IRQTF_AFFINITY, &new->thread_flags); + return 0; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1035,6 +1116,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV; + new->irq = irq; + /* * Check whether the interrupt nests into another interrupt * thread. @@ -1052,8 +1135,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->handler = irq_nested_primary_handler; } else { - if (irq_settings_can_thread(desc)) - irq_setup_forced_threading(new); + if (irq_settings_can_thread(desc)) { + ret = irq_setup_forced_threading(new); + if (ret) + goto out_mput; + } } /* @@ -1062,37 +1148,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * thread. */ if (new->thread_fn && !nested) { - struct task_struct *t; - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - - t = kthread_create(irq_thread, new, "irq/%d-%s", irq, - new->name); - if (IS_ERR(t)) { - ret = PTR_ERR(t); + ret = setup_irq_thread(new, irq, false); + if (ret) goto out_mput; + if (new->secondary) { + ret = setup_irq_thread(new->secondary, irq, true); + if (ret) + goto out_thread; } - - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); - - /* - * We keep the reference to the task struct even if - * the thread dies to avoid that the interrupt code - * references an already freed task_struct. - */ - get_task_struct(t); - new->thread = t; - /* - * Tell the thread to set its affinity. This is - * important for shared interrupt handlers as we do - * not invoke setup_affinity() for the secondary - * handlers as everything is already set up. Even for - * interrupts marked with IRQF_NO_BALANCE this is - * correct as we want the thread to move to the cpu(s) - * on which the requesting code placed the interrupt. 
- */ - set_bit(IRQTF_AFFINITY, &new->thread_flags); } if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -1221,8 +1284,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc, irq, - new->flags & IRQF_TRIGGER_MASK); + ret = __irq_set_trigger(desc, + new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_mask; @@ -1253,7 +1316,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc, mask); + setup_affinity(desc, mask); } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; @@ -1265,7 +1328,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq, nmsk, omsk); } - new->irq = irq; *old_ptr = new; irq_pm_install_action(desc, new); @@ -1280,7 +1342,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq); + __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -1291,6 +1353,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (new->thread) wake_up_process(new->thread); + if (new->secondary) + wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); new->dir = NULL; @@ -1321,6 +1385,13 @@ out_thread: kthread_stop(t); put_task_struct(t); } + if (new->secondary && new->secondary->thread) { + struct task_struct *t = new->secondary->thread; + + new->secondary->thread = NULL; + kthread_stop(t); + put_task_struct(t); + } out_mput: module_put(desc->owner); return ret; @@ -1392,6 +1463,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { + irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); irq_release_resources(desc); } @@ -1428,9 +1500,14 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (action->thread) { kthread_stop(action->thread); put_task_struct(action->thread); + if (action->secondary && action->secondary->thread) { + kthread_stop(action->secondary->thread); + put_task_struct(action->secondary->thread); + } } module_put(desc->owner); + kfree(action->secondary); return action; } @@ -1574,8 +1651,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); - if (retval) + if (retval) { + kfree(action->secondary); kfree(action); + } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { @@ -1650,7 +1729,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (type != IRQ_TYPE_NONE) { int ret; - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, type); if (ret) { WARN(1, "failed to set type for IRQ%d\n", irq); @@ -1759,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } +EXPORT_SYMBOL_GPL(free_percpu_irq); /** * setup_percpu_irq - setup a per-cpu interrupt @@ -1788,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources, but doesn't - * 
automatically enable the interrupt. It has to be done on each - * CPU using enable_percpu_irq(). + * This call allocates interrupt resources and enables the + * interrupt on the local CPU. If the interrupt is supposed to be + * enabled on other CPUs, it has to be done on each CPU using + * enable_percpu_irq(). * * Dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of @@ -1829,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } +EXPORT_SYMBOL_GPL(request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. @@ -1875,6 +1957,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, irq_put_desc_busunlock(desc, flags); return err; } +EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. @@ -1920,3 +2003,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, irq_put_desc_busunlock(desc, flags); return err; } +EXPORT_SYMBOL_GPL(irq_set_irqchip_state); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7bf1f1bbb7fa..6b0c0b74a2a1 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,6 +18,23 @@ /* Temparory solution for building, will be removed later */ #include <linux/pci.h> +struct msi_desc *alloc_msi_entry(struct device *dev) +{ + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return NULL; + + INIT_LIST_HEAD(&desc->list); + desc->dev = dev; + + return desc; +} + +void free_msi_entry(struct msi_desc *entry) +{ + kfree(entry); +} + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { *msg = entry->msg; @@ -211,22 +228,18 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) { struct irq_chip *chip = info->chip; - BUG_ON(!chip); - if (!chip->irq_mask) - chip->irq_mask = pci_msi_mask_irq; - if (!chip->irq_unmask) - chip->irq_unmask = pci_msi_unmask_irq; + BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask); if (!chip->irq_set_affinity) chip->irq_set_affinity = msi_domain_set_affinity; } /** * msi_create_irq_domain - Create a MSI interrupt domain - * @of_node: Optional device-tree node of the interrupt controller + * @fwnode: Optional fwnode of the interrupt controller * @info: MSI domain info * @parent: Parent irq domain */ -struct irq_domain *msi_create_irq_domain(struct device_node *node, +struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { @@ -235,8 +248,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, - info); + return irq_domain_create_hierarchy(parent, 0, 0, fwnode, + &msi_domain_ops, info); } /** diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d22786a6dbde..e80c4400118a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc) desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); - pm_system_wakeup(); + pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; @@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) desc->cond_suspend_depth--; } -static bool suspend_device_irq(struct irq_desc *desc, int irq) +static bool suspend_device_irq(struct irq_desc *desc) { if (!desc->action 
|| desc->no_suspend_depth) return false; @@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) } desc->istate |= IRQS_SUSPENDED; - __disable_irq(desc, irq); + __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility @@ -126,7 +126,7 @@ void suspend_device_irqs(void) if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); - sync = suspend_device_irq(desc, irq); + sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) @@ -135,7 +135,7 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); -static void resume_irq(struct irq_desc *desc, int irq) +static void resume_irq(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); @@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq) desc->depth++; resume: desc->istate &= ~IRQS_SUSPENDED; - __enable_irq(desc, irq); + __enable_irq(desc); } static void resume_irqs(bool want_early) @@ -169,7 +169,7 @@ static void resume_irqs(bool want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); - resume_irq(desc, irq); + resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 0e97c142ce40..a916cf144b65 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/mutex.h> #include "internals.h" @@ -39,7 +40,7 @@ static struct proc_dir_entry *root_irq_dir; static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_data.affinity; + const struct cpumask *mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (irqd_is_setaffinity_pending(&desc->irq_data)) @@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_MUTEX(register_lock); char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; + /* + * irq directories are registered only when a handler is + * added, not when the descriptor is created, so multiple + * tasks might try to register at the same time. 
+ */ + mutex_lock(®ister_lock); + + if (desc->dir) + goto out_unlock; + memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - return; + goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ @@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: + mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) @@ -460,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; - if (!action && !any_count) + if ((!action || action == &chained_action) && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 7a5237a1bce5..b86886beee4f 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_to_desc(irq); local_irq_disable(); - desc->handle_irq(irq, desc); + desc->handle_irq(desc); local_irq_enable(); } } @@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); * * Is called with interrupts disabled and desc->lock held. */ -void check_irq_resend(struct irq_desc *desc, unsigned int irq) +void check_irq_resend(struct irq_desc *desc) { /* * We do not resend level type interrupts. Level type @@ -74,6 +74,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND + unsigned int irq = irq_desc_get_irq(desc); + /* * If the interrupt is running in the thread * context of the parent irq we need to be diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 3320b84cc60f..320579d89091 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,6 +15,7 @@ enum { _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQ_IS_POLLED = IRQ_IS_POLLED, + _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -28,6 +29,7 @@ enum { #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON #define IRQ_IS_POLLED GOT_YOU_MORON +#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -154,3 +156,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_IS_POLLED; } + +static inline bool irq_settings_disable_unlazy(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY; +} + +static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e2514b0e439e..32144175458d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) /* * Recovery handler for misrouted interrupts. 
*/ -static int try_one_irq(int irq, struct irq_desc *desc, bool force) +static int try_one_irq(struct irq_desc *desc, bool force) { irqreturn_t ret = IRQ_NONE; struct irqaction *action; @@ -133,7 +133,7 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc, false)) + if (try_one_irq(desc, false)) ok = 1; } out: @@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy) continue; local_irq_disable(); - try_one_irq(i, desc, true); + try_one_irq(desc, true); local_irq_enable(); } out: @@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret) * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) */ -static void -__report_bad_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { + unsigned int irq = irq_desc_get_irq(desc); struct irqaction *action; unsigned long flags; @@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, raw_spin_unlock_irqrestore(&desc->lock, flags); } -static void -report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { static int count = 100; if (count > 0) { count--; - __report_bad_irq(irq, desc, action_ret); + __report_bad_irq(desc, action_ret); } } @@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, #define SPURIOUS_DEFERRED 0x80000000 -void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) { + unsigned int irq; + if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc)) return; if (bad_action_ret(action_ret)) { - report_bad_irq(irq, desc, action_ret); + report_bad_irq(desc, action_ret); return; } @@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->last_unhandled = jiffies; } + irq = irq_desc_get_irq(desc); if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) @@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, /* * The interrupt is stuck */ - __report_bad_irq(irq, desc, action_ret); + __report_bad_irq(desc, action_ret); /* * Now kill the IRQ */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 52ebaca1b9fc..f7dd15d537f9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct static_key *key, int enable); +static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { @@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) { - if (!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_ENABLE); - else - jump_label_update(key, JUMP_LABEL_DISABLE); - } - atomic_inc(&key->enabled); + if (atomic_inc_return(&key->enabled) == 1) + jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key, atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); } else { - if 
(!jump_label_get_branch_default(key)) - jump_label_update(key, JUMP_LABEL_DISABLE); - else - jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_update(key); } jump_label_unlock(); } @@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } -/* +/* * Update code which is definitely not currently executing. * Architectures which need heavyweight synchronization to modify * running code can override this to make the non-live update case @@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - arch_jump_label_transform(entry, type); + arch_jump_label_transform(entry, type); +} + +static inline struct jump_entry *static_key_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); +} + +static inline bool static_key_type(struct static_key *key) +{ + return (unsigned long)key->entries & JUMP_TYPE_MASK; +} + +static inline struct static_key *jump_entry_key(struct jump_entry *entry) +{ + return (struct static_key *)((unsigned long)entry->key & ~1UL); +} + +static bool jump_entry_branch(struct jump_entry *entry) +{ + return (unsigned long)entry->key & 1UL; +} + +static enum jump_label_type jump_label_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool enabled = static_key_enabled(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return enabled ^ branch; } static void __jump_label_update(struct static_key *key, struct jump_entry *entry, - struct jump_entry *stop, int enable) + struct jump_entry *stop) { - for (; (entry < stop) && - (entry->key == (jump_label_t)(unsigned long)key); - entry++) { + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* * entry->code set to 0 invalidates module init text sections * kernel_text_address() verifies we are not in core kernel * init code, see jump_label_invalidate_module_init(). 
*/ if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, enable); + arch_jump_label_transform(entry, jump_label_type(entry)); } } -static enum jump_label_type jump_label_type(struct static_key *key) -{ - bool true_branch = jump_label_get_branch_default(key); - bool state = static_key_enabled(key); - - if ((!true_branch && state) || (true_branch && !state)) - return JUMP_LABEL_ENABLE; - - return JUMP_LABEL_DISABLE; -} - void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; @@ -202,8 +211,11 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_type(iterk)); + /* rewrite NOPs */ + if (jump_label_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); + + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -222,6 +234,16 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES +static enum jump_label_type jump_label_init_type(struct jump_entry *entry) +{ + struct static_key *key = jump_entry_key(entry); + bool type = static_key_type(key); + bool branch = jump_entry_branch(entry); + + /* See the comment in linux/jump_label.h */ + return type ^ branch; +} + struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; @@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct static_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key) { - struct static_key_mod *mod = key->next; + struct static_key_mod *mod; - while (mod) { + for (mod = key->next; mod; mod = mod->next) { struct module *m = mod->mod; __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries, - enable); - mod = mod->next; + m->jump_entries + m->num_jump_entries); } } @@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + /* Only write NOPs for arch_branch_static(). 
*/ + if (jump_label_init_type(iter) == JUMP_LABEL_NOP) + arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); } } @@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; - iterk = (struct static_key *)(unsigned long)iter->key; + iterk = jump_entry_key(iter); if (iterk == key) continue; @@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod) jlm->next = key->next; key->next = jlm; - if (jump_label_type(key) == JUMP_LABEL_ENABLE) - __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); + /* Only update if we've changed from our initial state */ + if (jump_label_type(iter) != jump_label_init_type(iter)) + __jump_label_update(key, iter, iter_stop); } return 0; @@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod) struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) + if (jump_entry_key(iter) == key) continue; - key = (struct static_key *)(unsigned long)iter->key; + key = jump_entry_key(iter); if (within_module(iter->key, mod)) continue; @@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct static_key *key, int enable) +static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = jump_label_get_entries(key); + struct jump_entry *entry = static_key_entries(key); #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key, enable); + __jump_label_mod_update(key); preempt_disable(); mod = __module_address((unsigned long)key); @@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable) #endif /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, enable); + __jump_label_update(key, entry, stop); } -#endif +#ifdef CONFIG_STATIC_KEYS_SELFTEST +static DEFINE_STATIC_KEY_TRUE(sk_true); +static DEFINE_STATIC_KEY_FALSE(sk_false); + +static __init int jump_label_test(void) +{ + int i; + + for (i = 0; i < 2; i++) { + WARN_ON(static_key_enabled(&sk_true.key) != true); + WARN_ON(static_key_enabled(&sk_false.key) != false); + + WARN_ON(!static_branch_likely(&sk_true)); + WARN_ON(!static_branch_unlikely(&sk_true)); + WARN_ON(static_branch_likely(&sk_false)); + WARN_ON(static_branch_unlikely(&sk_false)); + + static_branch_disable(&sk_true); + static_branch_enable(&sk_false); + + WARN_ON(static_key_enabled(&sk_true.key) == true); + WARN_ON(static_key_enabled(&sk_false.key) == false); + + WARN_ON(static_branch_likely(&sk_true)); + WARN_ON(static_branch_unlikely(&sk_true)); + WARN_ON(!static_branch_likely(&sk_false)); + WARN_ON(!static_branch_unlikely(&sk_false)); + + static_branch_enable(&sk_true); + static_branch_disable(&sk_false); + } + + return 0; +} +late_initcall(jump_label_test); +#endif /* STATIC_KEYS_SELFTEST */ + +#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kexec.c b/kernel/kexec.c index a785c1015e25..4c5edc357923 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,156 +1,22 @@ /* - * kexec.c - kexec system call + * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. 
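 *
 * For illustration only (placeholder values, not part of this patch): the
 * syscall this file now exclusively implements is invoked from userspace
 * roughly as follows, with buf holding the new kernel image and load_addr
 * and mem_size page aligned:
 *
 *	struct kexec_segment seg = {
 *		.buf	= buf,
 *		.bufsz	= buf_size,
 *		.mem	= (void *)load_addr,
 *		.memsz	= mem_size,
 *	};
 *	syscall(__NR_kexec_load, entry_addr, 1, &seg, KEXEC_ARCH_DEFAULT);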
*/ -#define pr_fmt(fmt) "kexec: " fmt - #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> -#include <linux/slab.h> -#include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/reboot.h> -#include <linux/ioport.h> -#include <linux/hardirq.h> -#include <linux/elf.h> -#include <linux/elfcore.h> -#include <linux/utsname.h> -#include <linux/numa.h> -#include <linux/suspend.h> -#include <linux/device.h> -#include <linux/freezer.h> -#include <linux/pm.h> -#include <linux/cpu.h> -#include <linux/console.h> #include <linux/vmalloc.h> -#include <linux/swap.h> -#include <linux/syscore_ops.h> -#include <linux/compiler.h> -#include <linux/hugetlb.h> - -#include <asm/page.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/sections.h> - -#include <crypto/hash.h> -#include <crypto/sha.h> - -/* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t __percpu *crash_notes; - -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - -/* Flag to indicate we are going to kexec a new kernel */ -bool kexec_in_progress = false; - -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - -#ifdef CONFIG_KEXEC_FILE -static int kexec_calculate_store_digests(struct kimage *image); -#endif - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in do_exit() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -/* - * When kexec transitions to the new kernel there is a one-to-one - * mapping between physical and virtual addresses. On processors - * where you can disable the MMU this is trivial, and easy. For - * others it is still a simple predictable page table to setup. - * - * In that environment kexec copies the new kernel to its final - * resting place. This means I can only support memory whose - * physical address can fit in an unsigned long. In particular - * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. - * If the assembly stub has more restrictive requirements - * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be - * defined more restrictively in <asm/kexec.h>. - * - * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single - * page of memory is necessary, but some architectures require more. 
- * Because this memory must be identity mapped in the transition from - * virtual to physical addresses it must live in the range - * 0 - TASK_SIZE, as only the user space mappings are arbitrarily - * modifiable. - * - * The assembly stub in the control code buffer is passed a linked list - * of descriptor pages detailing the source pages of the new kernel, - * and the destination addresses of those source pages. As this data - * structure is not used in the context of the current OS, it must - * be self-contained. - * - * The code has been made to work with highmem pages and will use a - * destination page in its final resting place (if it happens - * to allocate it). The end product of this is that most of the - * physical address space, and most of RAM can be used. - * - * Future directions include: - * - allocating a page table with the control code buffer identity - * mapped, to simplify machine_kexec and make kexec_on_panic more - * reliable. - */ - -/* - * KIMAGE_NO_DEST is an impossible destination address..., for - * allocating pages whose destination address we do not care about. - */ -#define KIMAGE_NO_DEST (-1UL) +#include <linux/slab.h> -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long dest); +#include "kexec_internal.h" static int copy_user_segment_list(struct kimage *image, unsigned long nr_segments, @@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image, return ret; } -static int sanity_check_segment_list(struct kimage *image) -{ - int result, i; - unsigned long nr_segments = image->nr_segments; - - /* - * Verify we have good destination addresses. The caller is - * responsible for making certain we don't attempt to load - * the new image into invalid or reserved areas of RAM. This - * just verifies it is an address we can use. - * - * Since the kernel does everything in page size chunks ensure - * the destination addresses are page aligned. Too many - * special cases crop of when we don't do this. The most - * insidious is getting overlapping destination addresses - * simply because addresses are changed to page size - * granularity. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; - if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; - } - - /* Verify our destination addresses do not overlap. - * If we alloed overlapping destination addresses - * through very weird things can happen with no - * easy explanation as one segment stops on another. - */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - unsigned long j; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - for (j = 0; j < i; j++) { - unsigned long pstart, pend; - pstart = image->segment[j].mem; - pend = pstart + image->segment[j].memsz; - /* Do the segments overlap ? */ - if ((mend > pstart) && (mstart < pend)) - return result; - } - } - - /* Ensure our buffer sizes are strictly less than - * our memory sizes. This should always be the case, - * and it is easier to check up front than to be surprised - * later on. 
- */ - result = -EINVAL; - for (i = 0; i < nr_segments; i++) { - if (image->segment[i].bufsz > image->segment[i].memsz) - return result; - } - - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - - if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; - } - } - - return 0; -} - -static struct kimage *do_kimage_alloc_init(void) -{ - struct kimage *image; - - /* Allocate a controlling structure */ - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - return NULL; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); - - return image; -} - -static void kimage_free_page_list(struct list_head *list); - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, @@ -354,873 +101,6 @@ out_free_image: return ret; } -#ifdef CONFIG_KEXEC_FILE -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. 
*/ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - -/* Architectures can provide this probe function */ -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return -ENOEXEC; -} - -void * __weak arch_kexec_kernel_image_load(struct kimage *image) -{ - return ERR_PTR(-ENOEXEC); -} - -void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) -{ -} - -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return -EKEYREJECTED; -} - -/* Apply relocations of type RELA */ -int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* Apply relocations of type REL */ -int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * Free up memory used by kernel, initrd, and command line. This is temporary - * memory allocation which is not needed any more after these buffers have - * been loaded into separate segments and have been copied elsewhere. - */ -static void kimage_file_post_load_cleanup(struct kimage *image) -{ - struct purgatory_info *pi = &image->purgatory_info; - - vfree(image->kernel_buf); - image->kernel_buf = NULL; - - vfree(image->initrd_buf); - image->initrd_buf = NULL; - - kfree(image->cmdline_buf); - image->cmdline_buf = NULL; - - vfree(pi->purgatory_buf); - pi->purgatory_buf = NULL; - - vfree(pi->sechdrs); - pi->sechdrs = NULL; - - /* See if architecture has anything to cleanup post load */ - arch_kimage_file_post_load_cleanup(image); - - /* - * Above call should have called into bootloader to free up - * any data stored in kimage->image_loader_data. It should - * be ok now to free it up. - */ - kfree(image->image_loader_data); - image->image_loader_data = NULL; -} - -/* - * In file mode list of segments is prepared by kernel. 
Copy relevant - * data from user space, do error checking, prepare segment list - */ -static int -kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, - const char __user *cmdline_ptr, - unsigned long cmdline_len, unsigned flags) -{ - int ret = 0; - void *ldata; - - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); - if (ret) - return ret; - - /* Call arch image probe handlers */ - ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, - image->kernel_buf_len); - - if (ret) - goto out; - -#ifdef CONFIG_KEXEC_VERIFY_SIG - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); - if (ret) { - pr_debug("kernel signature verification failed.\n"); - goto out; - } - pr_debug("kernel signature verification successful.\n"); -#endif - /* It is possible that there no initramfs is being loaded */ - if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); - if (ret) - goto out; - } - - if (cmdline_len) { - image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); - if (!image->cmdline_buf) { - ret = -ENOMEM; - goto out; - } - - ret = copy_from_user(image->cmdline_buf, cmdline_ptr, - cmdline_len); - if (ret) { - ret = -EFAULT; - goto out; - } - - image->cmdline_buf_len = cmdline_len; - - /* command line should be a string with last byte null */ - if (image->cmdline_buf[cmdline_len - 1] != '\0') { - ret = -EINVAL; - goto out; - } - } - - /* Call arch image load handlers */ - ldata = arch_kexec_kernel_image_load(image); - - if (IS_ERR(ldata)) { - ret = PTR_ERR(ldata); - goto out; - } - - image->image_loader_data = ldata; -out: - /* In case of error, free up all allocated memory in this function */ - if (ret) - kimage_file_post_load_cleanup(image); - return ret; -} - -static int -kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, - int initrd_fd, const char __user *cmdline_ptr, - unsigned long cmdline_len, unsigned long flags) -{ - int ret; - struct kimage *image; - bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; - - image = do_kimage_alloc_init(); - if (!image) - return -ENOMEM; - - image->file_mode = 1; - - if (kexec_on_panic) { - /* Enable special crash kernel control page alloc policy. 
*/ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - } - - ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, - cmdline_ptr, cmdline_len, flags); - if (ret) - goto out_free_image; - - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_post_load_bufs; - - ret = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - pr_err("Could not allocate control_code_buffer\n"); - goto out_free_post_load_bufs; - } - - if (!kexec_on_panic) { - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free_control_pages; - } - } - - *rimage = image; - return 0; -out_free_control_pages: - kimage_free_page_list(&image->control_pages); -out_free_post_load_bufs: - kimage_file_post_load_cleanup(image); -out_free_image: - kfree(image); - return ret; -} -#else /* CONFIG_KEXEC_FILE */ -static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -#endif /* CONFIG_KEXEC_FILE */ - -static int kimage_is_destination_range(struct kimage *image, - unsigned long start, - unsigned long end) -{ - unsigned long i; - - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) - return 1; - } - - return 0; -} - -static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) -{ - struct page *pages; - - pages = alloc_pages(gfp_mask, order); - if (pages) { - unsigned int count, i; - pages->mapping = NULL; - set_page_private(pages, order); - count = 1 << order; - for (i = 0; i < count; i++) - SetPageReserved(pages + i); - } - - return pages; -} - -static void kimage_free_pages(struct page *page) -{ - unsigned int order, count, i; - - order = page_private(page); - count = 1 << order; - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); -} - -static void kimage_free_page_list(struct list_head *list) -{ - struct list_head *pos, *next; - - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); - list_del(&page->lru); - kimage_free_pages(page); - } -} - -static struct page *kimage_alloc_normal_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * At worst this runs in O(N) of the image size. - */ - struct list_head extra_pages; - struct page *pages; - unsigned int count; - - count = 1 << order; - INIT_LIST_HEAD(&extra_pages); - - /* Loop while I can allocate a page and the page allocated - * is a destination page. 
- */ - do { - unsigned long pfn, epfn, addr, eaddr; - - pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); - if (!pages) - break; - pfn = page_to_pfn(pages); - epfn = pfn + count; - addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; - if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) { - list_add(&pages->lru, &extra_pages); - pages = NULL; - } - } while (!pages); - - if (pages) { - /* Remember the allocated page... */ - list_add(&pages->lru, &image->control_pages); - - /* Because the page is already in it's destination - * location we will never allocate another page at - * that address. Therefore kimage_alloc_pages - * will not return it (again) and we don't need - * to give it an entry in image->segment[]. - */ - } - /* Deal with the destination pages I have inadvertently allocated. - * - * Ideally I would convert multi-page allocations into single - * page allocations, and add everything to image->dest_pages. - * - * For now it is simpler to just free the pages. - */ - kimage_free_page_list(&extra_pages); - - return pages; -} - -static struct page *kimage_alloc_crash_control_pages(struct kimage *image, - unsigned int order) -{ - /* Control pages are special, they are the intermediaries - * that are needed while we copy the rest of the pages - * to their final resting place. As such they must - * not conflict with either the destination addresses - * or memory the kernel is already using. - * - * Control pages are also the only pags we must allocate - * when loading a crash kernel. All of the other pages - * are specified by the segments and we just memcpy - * into them directly. - * - * The only case where we really need more than one of - * these are for architectures where we cannot disable - * the MMU and must instead generate an identity mapped - * page table for all of the memory. - * - * Given the low demand this implements a very simple - * allocator that finds the first hole of the appropriate - * size in the reserved memory region, and allocates all - * of the memory up to and including the hole. - */ - unsigned long hole_start, hole_end, size; - struct page *pages; - - pages = NULL; - size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - while (hole_end <= crashk_res.end) { - unsigned long i; - - if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) - break; - /* See if I overlap any of the segments */ - for (i = 0; i < image->nr_segments; i++) { - unsigned long mstart, mend; - - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - if ((hole_end >= mstart) && (hole_start <= mend)) { - /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); - hole_end = hole_start + size - 1; - break; - } - } - /* If I don't overlap any segments I have found my hole! 
*/ - if (i == image->nr_segments) { - pages = pfn_to_page(hole_start >> PAGE_SHIFT); - break; - } - } - if (pages) - image->control_page = hole_end; - - return pages; -} - - -struct page *kimage_alloc_control_pages(struct kimage *image, - unsigned int order) -{ - struct page *pages = NULL; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - pages = kimage_alloc_normal_control_pages(image, order); - break; - case KEXEC_TYPE_CRASH: - pages = kimage_alloc_crash_control_pages(image, order); - break; - } - - return pages; -} - -static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) -{ - if (*image->entry != 0) - image->entry++; - - if (image->entry == image->last_entry) { - kimage_entry_t *ind_page; - struct page *page; - - page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) - return -ENOMEM; - - ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; - image->entry = ind_page; - image->last_entry = ind_page + - ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); - } - *image->entry = entry; - image->entry++; - *image->entry = 0; - - return 0; -} - -static int kimage_set_destination(struct kimage *image, - unsigned long destination) -{ - int result; - - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - - return result; -} - - -static int kimage_add_page(struct kimage *image, unsigned long page) -{ - int result; - - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - - return result; -} - - -static void kimage_free_extra_pages(struct kimage *image) -{ - /* Walk through and free any extra destination pages I may have */ - kimage_free_page_list(&image->dest_pages); - - /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unusable_pages); - -} -static void kimage_terminate(struct kimage *image) -{ - if (*image->entry != 0) - image->entry++; - - *image->entry = IND_DONE; -} - -#define for_each_kimage_entry(image, ptr, entry) \ - for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) - -static void kimage_free_entry(kimage_entry_t entry) -{ - struct page *page; - - page = pfn_to_page(entry >> PAGE_SHIFT); - kimage_free_pages(page); -} - -static void kimage_free(struct kimage *image) -{ - kimage_entry_t *ptr, entry; - kimage_entry_t ind = 0; - - if (!image) - return; - - kimage_free_extra_pages(image); - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_INDIRECTION) { - /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - /* Save this indirection page until we are - * done with it. - */ - ind = entry; - } else if (entry & IND_SOURCE) - kimage_free_entry(entry); - } - /* Free the final indirection page */ - if (ind & IND_INDIRECTION) - kimage_free_entry(ind); - - /* Handle any machine specific cleanup */ - machine_kexec_cleanup(image); - - /* Free the kexec control pages... */ - kimage_free_page_list(&image->control_pages); - - /* - * Free up any temporary buffers allocated. This might hit if - * error occurred much later after buffer allocation. 
- */ - if (image->file_mode) - kimage_file_post_load_cleanup(image); - - kfree(image); -} - -static kimage_entry_t *kimage_dst_used(struct kimage *image, - unsigned long page) -{ - kimage_entry_t *ptr, entry; - unsigned long destination = 0; - - for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) - destination = entry & PAGE_MASK; - else if (entry & IND_SOURCE) { - if (page == destination) - return ptr; - destination += PAGE_SIZE; - } - } - - return NULL; -} - -static struct page *kimage_alloc_page(struct kimage *image, - gfp_t gfp_mask, - unsigned long destination) -{ - /* - * Here we implement safeguards to ensure that a source page - * is not copied to its destination page before the data on - * the destination page is no longer useful. - * - * To do this we maintain the invariant that a source page is - * either its own destination page, or it is not a - * destination page at all. - * - * That is slightly stronger than required, but the proof - * that no problems will not occur is trivial, and the - * implementation is simply to verify. - * - * When allocating all pages normally this algorithm will run - * in O(N) time, but in the worst case it will run in O(N^2) - * time. If the runtime is a problem the data structures can - * be fixed. - */ - struct page *page; - unsigned long addr; - - /* - * Walk through the list of destination pages, and see if I - * have a match. - */ - list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; - if (addr == destination) { - list_del(&page->lru); - return page; - } - } - page = NULL; - while (1) { - kimage_entry_t *old; - - /* Allocate a page, if we run out of memory give up */ - page = kimage_alloc_pages(gfp_mask, 0); - if (!page) - return NULL; - /* If the page cannot be used file it away */ - if (page_to_pfn(page) > - (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unusable_pages); - continue; - } - addr = page_to_pfn(page) << PAGE_SHIFT; - - /* If it is the destination page we want use it */ - if (addr == destination) - break; - - /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) - break; - - /* - * I know that the page is someones destination page. - * See if there is already a source page for this - * destination page. And if so swap the source pages. - */ - old = kimage_dst_used(image, addr); - if (old) { - /* If so move it */ - unsigned long old_addr; - struct page *old_page; - - old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); - copy_highpage(page, old_page); - *old = addr | (*old & ~PAGE_MASK); - - /* The old page I have found cannot be a - * destination page, so return it if it's - * gfp_flags honor the ones passed in. - */ - if (!(gfp_mask & __GFP_HIGHMEM) && - PageHighMem(old_page)) { - kimage_free_pages(old_page); - continue; - } - addr = old_addr; - page = old_page; - break; - } else { - /* Place the page on the destination list I - * will use it later. 
- */ - list_add(&page->lru, &image->dest_pages); - } - } - - return page; -} - -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) -{ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - - result = kimage_set_destination(image, maddr); - if (result < 0) - goto out; - - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (!page) { - result = -ENOMEM; - goto out; - } - result = kimage_add_page(image, page_to_pfn(page) - << PAGE_SHIFT); - if (result < 0) - goto out; - - ptr = kmap(page); - /* Start with a clear page */ - clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) -{ - /* For crash dumps kernels we simply copy the data from - * user space to it's destination. - * We do things a page at a time for the sake of kmap. - */ - unsigned long maddr; - size_t ubytes, mbytes; - int result; - unsigned char __user *buf = NULL; - unsigned char *kbuf = NULL; - - result = 0; - if (image->file_mode) - kbuf = segment->kbuf; - else - buf = segment->buf; - ubytes = segment->bufsz; - mbytes = segment->memsz; - maddr = segment->mem; - while (mbytes) { - struct page *page; - char *ptr; - size_t uchunk, mchunk; - - page = pfn_to_page(maddr >> PAGE_SHIFT); - if (!page) { - result = -ENOMEM; - goto out; - } - ptr = kmap(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); - uchunk = min(ubytes, mchunk); - if (mchunk > uchunk) { - /* Zero the trailing part of the page */ - memset(ptr + uchunk, 0, mchunk - uchunk); - } - - /* For file based kexec, source pages are in kernel memory */ - if (image->file_mode) - memcpy(ptr, kbuf, uchunk); - else - result = copy_from_user(ptr, buf, uchunk); - kexec_flush_icache_page(page); - kunmap(page); - if (result) { - result = -EFAULT; - goto out; - } - ubytes -= uchunk; - maddr += mchunk; - if (image->file_mode) - kbuf += mchunk; - else - buf += mchunk; - mbytes -= mchunk; - } -out: - return result; -} - -static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) -{ - int result = -ENOMEM; - - switch (image->type) { - case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); - break; - case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); - break; - } - - return result; -} - /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. 
*/ -struct kimage *kexec_image; -struct kimage *kexec_crash_image; -int kexec_load_disabled; - -static DEFINE_MUTEX(kexec_mutex); SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) @@ -1340,18 +215,6 @@ out: return result; } -/* - * Add and remove page tables for crashkernel memory - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, nr_segments, @@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return sys_kexec_load(entry, nr_segments, ksegments, flags); } #endif - -#ifdef CONFIG_KEXEC_FILE -SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, - unsigned long, cmdline_len, const char __user *, cmdline_ptr, - unsigned long, flags) -{ - int ret = 0, i; - struct kimage **dest_image, *image; - - /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) - return -EPERM; - - /* Make sure we have a legal set of flags */ - if (flags != (flags & KEXEC_FILE_FLAGS)) - return -EINVAL; - - image = NULL; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - - dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) - dest_image = &kexec_crash_image; - - if (flags & KEXEC_FILE_UNLOAD) - goto exchange; - - /* - * In case of crash, new kernel gets loaded in reserved region. It is - * same memory where old crash kernel might be loaded. Free any - * current crash dump kernel before we corrupt it. - */ - if (flags & KEXEC_FILE_ON_CRASH) - kimage_free(xchg(&kexec_crash_image, NULL)); - - ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, - cmdline_len, flags); - if (ret) - goto out; - - ret = machine_kexec_prepare(image); - if (ret) - goto out; - - ret = kexec_calculate_store_digests(image); - if (ret) - goto out; - - for (i = 0; i < image->nr_segments; i++) { - struct kexec_segment *ksegment; - - ksegment = &image->segment[i]; - pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", - i, ksegment->buf, ksegment->bufsz, ksegment->mem, - ksegment->memsz); - - ret = kimage_load_segment(image, &image->segment[i]); - if (ret) - goto out; - } - - kimage_terminate(image); - - /* - * Free up any temporary buffers allocated which are not needed - * after image has been loaded - */ - kimage_file_post_load_cleanup(image); -exchange: - image = xchg(dest_image, image); -out: - mutex_unlock(&kexec_mutex); - kimage_free(image); - return ret; -} - -#endif /* CONFIG_KEXEC_FILE */ - -void crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_mutex here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... 
- */ - if (mutex_trylock(&kexec_mutex)) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - mutex_unlock(&kexec_mutex); - } -} - -size_t crash_get_memory_size(void) -{ - size_t size = 0; - mutex_lock(&kexec_mutex); - if (crashk_res.end != crashk_res.start) - size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); - return size; -} - -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long start, end; - unsigned long old_size; - struct resource *ram_res; - - mutex_lock(&kexec_mutex); - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - start = crashk_res.start; - end = crashk_res.end; - old_size = (end == 0) ? 0 : end - start + 1; - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) { - ret = -ENOMEM; - goto unlock; - } - - start = roundup(start, KEXEC_CRASH_MEM_ALIGN); - end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - - crash_map_reserved_pages(); - crash_free_reserved_phys_range(end, crashk_res.end); - - if ((start == end) && (crashk_res.parent != NULL)) - release_resource(&crashk_res); - - ram_res->start = end; - ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; - ram_res->name = "System RAM"; - - crashk_res.end = end - 1; - - insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); - -unlock: - mutex_unlock(&kexec_mutex); - return ret; -} - -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. 
*/ - crash_notes = alloc_percpu(note_buf_t); - if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); - return -ENOMEM; - } - return 0; -} -subsys_initcall(crash_notes_memory_init); - - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char\n"); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. 
- */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -unsigned long __weak paddr_vmcoreinfo_note(void) -{ - return __pa((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - 
log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - -#ifdef CONFIG_KEXEC_FILE -static int locate_mem_hole_top_down(unsigned long start, unsigned long end, - struct kexec_buf *kbuf) -{ - struct kimage *image = kbuf->image; - unsigned long temp_start, temp_end; - - temp_end = min(end, kbuf->buf_max); - temp_start = temp_end - kbuf->memsz; - - do { - /* align down start */ - temp_start = temp_start & (~(kbuf->buf_align - 1)); - - if (temp_start < start || temp_start < kbuf->buf_min) - return 0; - - temp_end = temp_start + kbuf->memsz - 1; - - /* - * Make sure this does not conflict with any of existing - * segments - */ - if (kimage_is_destination_range(image, temp_start, temp_end)) { - temp_start = temp_start - PAGE_SIZE; - continue; - } - - /* We found a suitable memory range */ - break; - } while (1); - - /* If we are here, we found a suitable memory range */ - kbuf->mem = temp_start; - - /* Success, stop navigating through remaining System RAM ranges */ - return 1; -} - -static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, - struct kexec_buf *kbuf) -{ - struct kimage *image = kbuf->image; - unsigned long temp_start, temp_end; - - temp_start = max(start, kbuf->buf_min); - - do { - temp_start = ALIGN(temp_start, kbuf->buf_align); - temp_end = temp_start + kbuf->memsz - 1; - - if (temp_end > end || temp_end > kbuf->buf_max) - return 0; - /* - * Make sure this does not conflict with any of existing - * segments - */ - if (kimage_is_destination_range(image, temp_start, temp_end)) { - temp_start = temp_start + PAGE_SIZE; - continue; - } - - /* We found a suitable memory range */ - break; - } while (1); - - /* If we are here, we found a suitable memory range */ - kbuf->mem = temp_start; - - /* Success, stop navigating through remaining System RAM ranges */ - return 1; -} - -static int locate_mem_hole_callback(u64 start, u64 end, void *arg) -{ - struct kexec_buf *kbuf = (struct kexec_buf *)arg; - unsigned long sz = end - start + 1; - - /* Returning 0 will take to next memory range */ - if (sz < kbuf->memsz) - return 0; - - if (end < kbuf->buf_min || start > kbuf->buf_max) - return 0; - - /* - * Allocate memory top down with-in ram range. Otherwise bottom up - * allocation. - */ - if (kbuf->top_down) - return locate_mem_hole_top_down(start, end, kbuf); - return locate_mem_hole_bottom_up(start, end, kbuf); -} - -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. 
- */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) -{ - - struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; - int ret; - - /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) - return -EINVAL; - - if (image->nr_segments >= KEXEC_SEGMENT_MAX) - return -EINVAL; - - /* - * Make sure we are not trying to add buffer after allocating - * control pages. All segments need to be placed first before - * any control pages are allocated. As control page allocation - * logic goes through list of segments to make sure there are - * no destination overlaps. - */ - if (!list_empty(&image->control_pages)) { - WARN_ON(1); - return -EINVAL; - } - - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; - - /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } - - /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; - ksegment->kbuf = kbuf->buffer; - ksegment->bufsz = kbuf->bufsz; - ksegment->mem = kbuf->mem; - ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; - return 0; -} - -/* Calculate and store the digest of segments */ -static int kexec_calculate_store_digests(struct kimage *image) -{ - struct crypto_shash *tfm; - struct shash_desc *desc; - int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; - void *zero_buf; - struct kexec_sha_region *sha_regions; - struct purgatory_info *pi = &image->purgatory_info; - - zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); - zero_buf_sz = PAGE_SIZE; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); - sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) - goto out_free_desc; - - desc->tfm = tfm; - desc->flags = 0; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; - - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } - - for (j = i = 0; i < image->nr_segments; i++) { - struct kexec_segment *ksegment; - - ksegment = &image->segment[i]; - /* - * Skip purgatory as it will be modified once we put digest - * info in purgatory. - */ - if (ksegment->kbuf == pi->purgatory_buf) - continue; - - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; - - /* - * Assume rest of the buffer is filled with zero and - * update digest accordingly. 
- */ - nullsz = ksegment->memsz - ksegment->bufsz; - while (nullsz) { - unsigned long bytes = nullsz; - - if (bytes > zero_buf_sz) - bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; - nullsz -= bytes; - } - - if (ret) - break; - - sha_regions[j].start = ksegment->mem; - sha_regions[j].len = ksegment->memsz; - j++; - } - - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; - - ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } - -out_free_digest: - kfree(digest); -out_free_sha_regions: - vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: - return ret; -} - -/* Actually load purgatory. Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) -{ - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; - - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; - - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - if (!sechdrs) - return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); - - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - - /* - * Identify entry point section and make entry relative to section - * start. - */ - entry = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - entry -= sechdrs[i].sh_addr; - break; - } - } - - /* Determine how much memory is needed to load relocatable object. 
*/ - buf_align = 1; - bss_align = 1; - buf_sz = 0; - bss_sz = 0; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; - } else { - /* bss section */ - if (bss_align < align) - bss_align = align; - bss_sz = ALIGN(bss_sz, align); - bss_sz += sechdrs[i].sh_size; - } - } - - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); - - memsz = buf_sz + bss_pad + bss_sz; - - /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { - ret = -ENOMEM; - goto out; - } - - if (buf_align < bss_align) - buf_align = bss_align; - - /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); - if (ret) - goto out; - - /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { - bss_addr = ALIGN(bss_addr, align); - sechdrs[i].sh_addr = bss_addr; - bss_addr += sechdrs[i].sh_size; - } - } - - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; - - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; - - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; - - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. - */ - pi->purgatory_buf = purgatory_buf; - return ret; -out: - vfree(sechdrs); - vfree(purgatory_buf); - return ret; -} - -static int kexec_apply_relocations(struct kimage *image) -{ - int i, ret; - struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; - - /* Apply relocations */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; - - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) - continue; - - /* - * For section of type SHT_RELA/SHT_REL, - * ->sh_link contains section header index of associated - * symbol table. And ->sh_info contains section header - * index of section to which relocations apply. - */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) - return -ENOEXEC; - - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; - - if (!(section->sh_flags & SHF_ALLOC)) - continue; - - /* - * symtab->sh_link contain section header index of associated - * string table. 
- */ - if (symtab->sh_link >= pi->ehdr->e_shnum) - /* Invalid section number? */ - continue; - - /* - * Respective architecture needs to provide support for applying - * relocations of type SHT_RELA/SHT_REL. - */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); - if (ret) - return ret; - } - - return 0; -} - -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) -{ - struct purgatory_info *pi = &image->purgatory_info; - int ret; - - if (kexec_purgatory_size <= 0) - return -EINVAL; - - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; - - ret = __kexec_load_purgatory(image, min, max, top_down); - if (ret) - return ret; - - ret = kexec_apply_relocations(image); - if (ret) - goto out; - - *load_addr = pi->purgatory_load_addr; - return 0; -out: - vfree(pi->sechdrs); - vfree(pi->purgatory_buf); - return ret; -} - -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) -{ - Elf_Sym *syms; - Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; - int i, k; - const char *strtab; - - if (!pi->sechdrs || !pi->ehdr) - return NULL; - - sechdrs = pi->sechdrs; - ehdr = pi->ehdr; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; -} - -void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) -{ - struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; - Elf_Shdr *sechdr; - - sym = kexec_purgatory_find_symbol(pi, name); - if (!sym) - return ERR_PTR(-EINVAL); - - sechdr = &pi->sechdrs[sym->st_shndx]; - - /* - * Returns the address where symbol will finally be loaded after - * kexec_load_segment() - */ - return (void *)(sechdr->sh_addr + sym->st_value); -} - -/* - * Get or set value of a symbol. If "get_value" is true, symbol value is - * returned in buf otherwise symbol value is set based on value in buf. 
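The sanity checks on the purgatory object above are plain ELF header validation. As a rough user-space sketch (illustrative only: it maps an arbitrary .o file rather than the built-in purgatory blob, and assumes 64-bit ELF types), the same ET_REL checks look like this:

/* Rough user-space analogue of the purgatory header checks (illustrative). */
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static int check_rel_object(const void *buf, size_t size)
{
	const Elf64_Ehdr *ehdr = buf;

	if (size < sizeof(*ehdr))
		return -1;
	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
	    ehdr->e_type != ET_REL ||
	    ehdr->e_shentsize != sizeof(Elf64_Shdr))
		return -1;
	if (ehdr->e_shoff >= size ||
	    ehdr->e_shnum * sizeof(Elf64_Shdr) > size - ehdr->e_shoff)
		return -1;
	return 0;
}

int main(int argc, char **argv)
{
	struct stat st;
	void *map;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	printf("%s: %s\n", argv[1],
	       check_rel_object(map, st.st_size) ? "rejected" : "looks like ET_REL");
	munmap(map, st.st_size);
	close(fd);
	return 0;
}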
- */ -int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, - void *buf, unsigned int size, bool get_value) -{ - Elf_Sym *sym; - Elf_Shdr *sechdrs; - struct purgatory_info *pi = &image->purgatory_info; - char *sym_buf; - - sym = kexec_purgatory_find_symbol(pi, name); - if (!sym) - return -EINVAL; - - if (sym->st_size != size) { - pr_err("symbol %s size mismatch: expected %lu actual %u\n", - name, (unsigned long)sym->st_size, size); - return -EINVAL; - } - - sechdrs = pi->sechdrs; - - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - pr_err("symbol %s is in a bss section. Cannot %s\n", name, - get_value ? "get" : "set"); - return -EINVAL; - } - - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; - - if (get_value) - memcpy((void *)buf, sym_buf, size); - else - memcpy((void *)sym_buf, buf, size); - - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - -/* - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -int kernel_kexec(void) -{ - int error = 0; - - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - if (!kexec_image) { - error = -EINVAL; - goto Unlock; - } - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - lock_system_sleep(); - pm_prepare_console(); - error = freeze_processes(); - if (error) { - error = -EBUSY; - goto Restore_console; - } - suspend_console(); - error = dpm_suspend_start(PMSG_FREEZE); - if (error) - goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_end(). We *must* call - * dpm_suspend_end() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. - */ - error = dpm_suspend_end(PMSG_FREEZE); - if (error) - goto Resume_devices; - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - local_irq_disable(); - error = syscore_suspend(); - if (error) - goto Enable_irqs; - } else -#endif - { - kexec_in_progress = true; - kernel_restart_prepare(NULL); - migrate_to_reboot_cpu(); - - /* - * migrate_to_reboot_cpu() disables CPU hotplug assuming that - * no further code needs to use CPU hotplug (which is true in - * the reboot case). However, the kexec path depends on using - * CPU hotplug again; so re-enable it here. - */ - cpu_hotplug_enable(); - pr_emerg("Starting new kernel\n"); - machine_shutdown(); - } - - machine_kexec(kexec_image); - -#ifdef CONFIG_KEXEC_JUMP - if (kexec_image->preserve_context) { - syscore_resume(); - Enable_irqs: - local_irq_enable(); - Enable_cpus: - enable_nonboot_cpus(); - dpm_resume_start(PMSG_RESTORE); - Resume_devices: - dpm_resume_end(PMSG_RESTORE); - Resume_console: - resume_console(); - thaw_processes(); - Restore_console: - pm_restore_console(); - unlock_system_sleep(); - } -#endif - - Unlock: - mutex_unlock(&kexec_mutex); - return error; -} diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c new file mode 100644 index 000000000000..bd9f8a03cefa --- /dev/null +++ b/kernel/kexec_core.c @@ -0,0 +1,1534 @@ +/* + * kexec.c - kexec system call core code. + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#define pr_fmt(fmt) "kexec: " fmt + +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/kexec.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/ioport.h> +#include <linux/hardirq.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/utsname.h> +#include <linux/numa.h> +#include <linux/suspend.h> +#include <linux/device.h> +#include <linux/freezer.h> +#include <linux/pm.h> +#include <linux/cpu.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/console.h> +#include <linux/vmalloc.h> +#include <linux/swap.h> +#include <linux/syscore_ops.h> +#include <linux/compiler.h> +#include <linux/hugetlb.h> + +#include <asm/page.h> +#include <asm/sections.h> + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include "kexec_internal.h" + +DEFINE_MUTEX(kexec_mutex); + +/* Per cpu memory for storing cpu states in case of system crash. */ +note_buf_t __percpu *crash_notes; + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* Flag to indicate we are going to kexec a new kernel */ +bool kexec_in_progress = false; + + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in do_exit() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in <asm/kexec.h>. + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. 
+ * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long dest); + +int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + * + * Since the kernel does everything in page size chunks ensure + * the destination addresses are page aligned. Too many + * special cases crop of when we don't do this. The most + * insidious is getting overlapping destination addresses + * simply because addresses are changed to page size + * granularity. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) + return result; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + return result; + } + + /* Verify our destination addresses do not overlap. + * If we alloed overlapping destination addresses + * through very weird things can happen with no + * easy explanation as one segment stops on another. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + unsigned long j; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + for (j = 0; j < i; j++) { + unsigned long pstart, pend; + + pstart = image->segment[j].mem; + pend = pstart + image->segment[j].memsz; + /* Do the segments overlap ? */ + if ((mend > pstart) && (mstart < pend)) + return result; + } + } + + /* Ensure our buffer sizes are strictly less than + * our memory sizes. This should always be the case, + * and it is easier to check up front than to be surprised + * later on. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + if (image->segment[i].bufsz > image->segment[i].memsz) + return result; + } + + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. 
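The destination-overlap test above is the standard half-open interval check: two segments collide exactly when each one starts before the other ends. A tiny standalone illustration with made-up segment addresses:

/* Two [start, start+size) ranges overlap iff each starts before the other ends. */
#include <stdbool.h>
#include <stdio.h>

static bool ranges_overlap(unsigned long mstart, unsigned long msize,
			   unsigned long pstart, unsigned long psize)
{
	unsigned long mend = mstart + msize;
	unsigned long pend = pstart + psize;

	return (mend > pstart) && (mstart < pend);
}

int main(void)
{
	/* Example values only. */
	printf("%d\n", ranges_overlap(0x100000, 0x2000, 0x101000, 0x1000)); /* 1: overlap */
	printf("%d\n", ranges_overlap(0x100000, 0x1000, 0x101000, 0x1000)); /* 0: adjacent */
	return 0;
}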
+ */ + + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; +} + +int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) + return 1; + } + + return 0; +} + +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *pages; + + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + + pages->mapping = NULL; + set_page_private(pages, order); + count = 1 << order; + for (i = 0; i < count; i++) + SetPageReserved(pages + i); + } + + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + + order = page_private(page); + count = 1 << order; + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + kimage_free_pages(page); + } +} + +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. 
+ */ + do { + unsigned long pfn, epfn, addr, eaddr; + + pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while (!pages); + + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everything to image->dest_pages. + * + * For now it is simpler to just free the pages. + */ + kimage_free_page_list(&extra_pages); + + return pages; +} + +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * Control pages are also the only pags we must allocate + * when loading a crash kernel. All of the other pages + * are specified by the segments and we just memcpy + * into them directly. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * Given the low demand this implements a very simple + * allocator that finds the first hole of the appropriate + * size in the reserved memory region, and allocates all + * of the memory up to and including the hole. + */ + unsigned long hole_start, hole_end, size; + struct page *pages; + + pages = NULL; + size = (1 << order) << PAGE_SHIFT; + hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + while (hole_end <= crashk_res.end) { + unsigned long i; + + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) + break; + /* See if I overlap any of the segments */ + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + if ((hole_end >= mstart) && (hole_start <= mend)) { + /* Advance the hole to the end of the segment */ + hole_start = (mend + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + break; + } + } + /* If I don't overlap any segments I have found my hole! 
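The crash hole allocator above keeps hole_start rounded up to the (power-of-two) allocation size before and after every collision with a segment. A minimal demonstration of that rounding arithmetic, with sample numbers only:

/* Round addr up to the next multiple of a power-of-two size. */
#include <stdio.h>

static unsigned long roundup_pow2(unsigned long addr, unsigned long size)
{
	return (addr + (size - 1)) & ~(size - 1);
}

int main(void)
{
	/* e.g. an order-2 (4-page) control allocation: size = 4 << 12 */
	unsigned long size = 4UL << 12;

	printf("%#lx\n", roundup_pow2(0x1234000, size));  /* already aligned: 0x1234000 */
	printf("%#lx\n", roundup_pow2(0x1235000, size));  /* rounds up to 0x1238000 */
	return 0;
}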
*/ + if (i == image->nr_segments) { + pages = pfn_to_page(hole_start >> PAGE_SHIFT); + image->control_page = hole_end; + break; + } + } + + return pages; +} + + +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + struct page *pages = NULL; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + pages = kimage_alloc_normal_control_pages(image, order); + break; + case KEXEC_TYPE_CRASH: + pages = kimage_alloc_crash_control_pages(image, order); + break; + } + + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) + image->entry++; + + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) + return -ENOMEM; + + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + + return 0; +} + +static int kimage_set_destination(struct kimage *image, + unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unusable pages I have cached */ + kimage_free_page_list(&image->unusable_pages); + +} +void kimage_terminate(struct kimage *image) +{ + if (*image->entry != 0) + image->entry++; + + *image->entry = IND_DONE; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } else if (entry & IND_SOURCE) + kimage_free_entry(entry); + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. 
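The kimage entry list walked by for_each_kimage_entry() packs a flag into the low bits of a page-aligned physical address, which is why the walker masks with PAGE_MASK to recover the address. A small sketch of that encoding (the flag values match the usual <linux/kexec.h> definitions but are shown here purely for illustration):

/* Illustration: a kimage entry is a page-aligned address OR'd with a flag. */
#include <stdio.h>

#define PAGE_SIZE_DEMO   4096UL
#define PAGE_MASK_DEMO   (~(PAGE_SIZE_DEMO - 1))

#define IND_DESTINATION  0x1
#define IND_INDIRECTION  0x2
#define IND_DONE         0x4
#define IND_SOURCE       0x8

int main(void)
{
	unsigned long entry = 0x7f123000UL | IND_SOURCE;   /* sample address */

	printf("addr=%#lx flags=%#lx\n",
	       entry & PAGE_MASK_DEMO, entry & ~PAGE_MASK_DEMO);
	return 0;
}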
+ */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) + destination = entry & PAGE_MASK; + else if (entry & IND_SOURCE) { + if (page == destination) + return ptr; + destination += PAGE_SIZE; + } + } + + return NULL; +} + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) + return NULL; + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unusable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it if it's + * gfp_flags honor the ones passed in. 
+ */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } + addr = old_addr; + page = old_page; + break; + } + /* Place the page on the destination list, to be used later */ + list_add(&page->lru, &image->dest_pages); + } + + return page; +} + +static int kimage_load_normal_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + result = kimage_set_destination(image, maddr); + if (result < 0) + goto out; + + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); + if (!page) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) + goto out; + + ptr = kmap(page); + /* Start with a clear page */ + clear_page(ptr); + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_crash_segment(struct kimage *image, + struct kexec_segment *segment) +{ + /* For crash dumps kernels we simply copy the data from + * user space to it's destination. + * We do things a page at a time for the sake of kmap. 
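Both segment loaders split the copy so that no chunk crosses a page boundary: the chunk size is the smaller of what is left and the room remaining in the current page. The arithmetic can be exercised stand-alone (demo sizes only):

/* Split a copy into chunks that never cross a page boundary (demo values). */
#include <stdio.h>

#define PAGE_SIZE_DEMO 4096UL

int main(void)
{
	unsigned long maddr = 0x100ff0;      /* destination, not page aligned */
	unsigned long mbytes = 10000;        /* bytes left to place */

	while (mbytes) {
		unsigned long room = PAGE_SIZE_DEMO - (maddr & (PAGE_SIZE_DEMO - 1));
		unsigned long mchunk = mbytes < room ? mbytes : room;

		printf("copy %5lu bytes to %#lx\n", mchunk, maddr);
		maddr += mchunk;
		mbytes -= mchunk;
	}
	return 0;
}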
+ */ + unsigned long maddr; + size_t ubytes, mbytes; + int result; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + result = 0; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = pfn_to_page(maddr >> PAGE_SHIFT); + if (!page) { + result = -ENOMEM; + goto out; + } + ptr = kmap(page); + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + int result = -ENOMEM; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + result = kimage_load_normal_segment(image, segment); + break; + case KEXEC_TYPE_CRASH: + result = kimage_load_crash_segment(image, segment); + break; + } + + return result; +} + +struct kimage *kexec_image; +struct kimage *kexec_crash_image; +int kexec_load_disabled; + +void crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_mutex here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (mutex_trylock(&kexec_mutex)) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + mutex_unlock(&kexec_mutex); + } +} + +size_t crash_get_memory_size(void) +{ + size_t size = 0; + + mutex_lock(&kexec_mutex); + if (crashk_res.end != crashk_res.start) + size = resource_size(&crashk_res); + mutex_unlock(&kexec_mutex); + return size; +} + +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + unsigned long old_size; + struct resource *ram_res; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + old_size = (end == 0) ? 0 : end - start + 1; + if (new_size >= old_size) { + ret = (new_size == old_size) ? 
0 : -EINVAL; + goto unlock; + } + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) { + ret = -ENOMEM; + goto unlock; + } + + start = roundup(start, KEXEC_CRASH_MEM_ALIGN); + end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); + + crash_map_reserved_pages(); + crash_free_reserved_phys_range(end, crashk_res.end); + + if ((start == end) && (crashk_res.parent != NULL)) + release_resource(&crashk_res); + + ram_res->start = end; + ram_res->end = crashk_res.end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->name = "System RAM"; + + crashk_res.end = end - 1; + + insert_resource(&iomem_resource, ram_res); + crash_unmap_reserved_pages(); + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. */ + size_t size, align; + + /* + * crash_notes could be allocated across 2 vmalloc pages when percpu + * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc + * pages are also on 2 continuous physical pages. In this case the + * 2nd part of crash_notes in 2nd page could be lost since only the + * starting address and size of crash_notes are exported through sysfs. + * Here round up the size of crash_notes to the nearest power of two + * and pass it to __alloc_percpu as align value. This can make sure + * crash_notes is allocated inside one physical page. + */ + size = sizeof(note_buf_t); + align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); + + /* + * Break compile if size is bigger than PAGE_SIZE since crash_notes + * definitely will be in 2 pages with that. 
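append_elf_note() above lays the note header, name and descriptor out back to back, each advanced in 4-byte units. A user-space sketch of the same packing (the note type and payload are invented examples, not the real prstatus blob):

/* Minimal sketch of the 4-byte-aligned ELF note layout. */
#include <elf.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t *append_note(uint32_t *buf, const char *name,
			     unsigned type, const void *data, size_t len)
{
	Elf64_Nhdr note = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = len,
		.n_type   = type,
	};

	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3) / 4;
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3) / 4;
	memcpy(buf, data, len);
	buf += (len + 3) / 4;
	return buf;
}

int main(void)
{
	uint32_t buf[64] = { 0 };
	char payload[] = "example register blob";   /* stand-in for prstatus */
	uint32_t *end = append_note(buf, "CORE", 1 /* NT_PRSTATUS */,
				    payload, sizeof(payload));

	printf("note occupies %zu bytes\n", (end - buf) * sizeof(uint32_t));
	return 0;
}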
+ */ + BUILD_BUG_ON(size > PAGE_SIZE); + + crash_notes = __alloc_percpu(size, align); + if (!crash_notes) { + pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + return -ENOMEM; + } + return 0; +} +subsys_initcall(crash_notes_memory_init); + + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. 
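The crashkernel= spellings handled here come in three forms, which can be illustrated with a rough user-space cousin of memparse() (only K/M/G suffixes handled; the command lines are examples, not recommendations):

/* Illustrative parser for the size values used by:
 *   crashkernel=256M                  simple: size[@offset]
 *   crashkernel=512M-2G:64M,2G-:128M  range:  ramsize-range:size[,...][@offset]
 *   crashkernel=256M,high             suffix: size,[high|low]
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long memparse_demo(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g':
		v <<= 10;
		/* fall through */
	case 'M': case 'm':
		v <<= 10;
		/* fall through */
	case 'K': case 'k':
		v <<= 10;
		(*end)++;
	}
	return v;
}

int main(void)
{
	char *end;

	printf("%llu\n", memparse_demo("256M", &end));    /* 268435456 */
	printf("%llu\n", memparse_demo("64M@16M", &end)); /* 67108864, end left at "@16M" */
	return 0;
}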
+ */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
+ */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +unsigned long __weak paddr_vmcoreinfo_note(void) +{ + return __pa((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + 
log_buf_kexec_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_X86 + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); +#endif +#ifdef CONFIG_HUGETLBFS + VMCOREINFO_SYMBOL(free_huge_page); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); + +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + lock_system_sleep(); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = dpm_suspend_start(PMSG_FREEZE); + if (error) + goto Resume_console; + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto Resume_devices; + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + local_irq_disable(); + error = syscore_suspend(); + if (error) + goto Enable_irqs; + } else +#endif + { + kexec_in_progress = true; + kernel_restart_prepare(NULL); + migrate_to_reboot_cpu(); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); + pr_emerg("Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + syscore_resume(); + Enable_irqs: + local_irq_enable(); + Enable_cpus: + enable_nonboot_cpus(); + dpm_resume_start(PMSG_RESTORE); + Resume_devices: + dpm_resume_end(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + unlock_system_sleep(); + } +#endif + + Unlock: + mutex_unlock(&kexec_mutex); + return error; +} + +/* + * Add and remove page tables for crashkernel memory + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +void __weak crash_map_reserved_pages(void) +{} + +void __weak crash_unmap_reserved_pages(void) +{} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c new file mode 100644 index 000000000000..6a9a3f2a0e8e --- /dev/null +++ b/kernel/kexec_file.c @@ -0,0 +1,1045 @@ +/* + * kexec: kexec_file_load system call + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include <linux/capability.h> +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <linux/syscalls.h> +#include <linux/vmalloc.h> +#include "kexec_internal.h" + +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + return -EINVAL; +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and command line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + + /* + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. + */ + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. 
Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: + kfree(image); + return ret; +} + +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. 
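From user space this new syscall is normally reached through syscall(2), since libc wrappers are generally absent. A hedged sketch follows (the paths, command line and flag choice are examples only; __NR_kexec_file_load must exist for the target architecture, and CAP_SYS_BOOT is required):

/* Sketch: load a kernel via kexec_file_load from user space. */
#include <fcntl.h>
#include <linux/kexec.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	const char cmdline[] = "console=ttyS0 root=/dev/sda1";   /* example */
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);         /* example path */
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);      /* example path */
	long ret;

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline_len must count the terminating NUL; flags could instead be
	 * e.g. KEXEC_FILE_ON_CRASH from <linux/kexec.h>. */
	ret = syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		      strlen(cmdline) + 1, cmdline, 0UL);
	if (ret)
		perror("kexec_file_load");

	return ret ? 1 : 0;
}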
*/ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + kbuf->mem = temp_start; + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range 
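The top-down walk above repeatedly aligns a candidate start downward and backs off a page at a time on conflicts. A toy version that only does the placement arithmetic (no conflict loop, invented range):

/* Toy top-down placement: highest aligned base in [min, max] (max inclusive)
 * whose [base, base+size) still fits; returns 0 if nothing fits. */
#include <stdio.h>

static unsigned long find_top_down(unsigned long min, unsigned long max,
				   unsigned long size, unsigned long align)
{
	unsigned long base = (max - size + 1) & ~(align - 1);

	return (base >= min) ? base : 0;
}

int main(void)
{
	/* Example: 64K buffer, 4K aligned, inside [1M, 16M). */
	printf("%#lx\n", find_top_down(0x100000, 0xffffff, 0x10000, 0x1000));
	return 0;
}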
*/ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = kbuf->buffer; + ksegment->bufsz = kbuf->bufsz; + ksegment->mem = kbuf->mem; + ksegment->memsz = kbuf->memsz; + image->nr_segments++; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; 
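The store-digests loop hashes each segment's bufsz bytes of data and then treats the remaining memsz - bufsz bytes as zeros, feeding the zero page repeatedly. A user-space analogue, using OpenSSL's incremental SHA-256 purely for brevity (build with -lcrypto; any incremental hash API would do, and the segment contents are invented):

/* Hash a segment's data followed by its implicit zero padding up to memsz. */
#include <openssl/sha.h>
#include <stdio.h>

static void hash_segment(SHA256_CTX *ctx, const void *buf,
			 size_t bufsz, size_t memsz)
{
	static const unsigned char zeros[4096];   /* stands in for ZERO_PAGE */
	size_t nullsz = memsz - bufsz;

	SHA256_Update(ctx, buf, bufsz);
	while (nullsz) {
		size_t chunk = nullsz < sizeof(zeros) ? nullsz : sizeof(zeros);

		SHA256_Update(ctx, zeros, chunk);
		nullsz -= chunk;
	}
}

int main(void)
{
	unsigned char digest[SHA256_DIGEST_LENGTH];
	char data[] = "segment payload";          /* example contents */
	SHA256_CTX ctx;
	int i;

	SHA256_Init(&ctx);
	hash_segment(&ctx, data, sizeof(data), 8192);  /* memsz > bufsz */
	SHA256_Final(digest, &ctx);

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return 0;
}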
+ + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. 
+ */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. 
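The size and alignment bookkeeping above (buf_sz for progbits sections, bss_sz for SHT_NOBITS sections, plus bss_pad so the bss area starts properly aligned) is easy to sanity-check outside the kernel. The following stand-alone sketch uses made-up section sizes and alignments purely to show the arithmetic; it is not part of the patch.

#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

struct sec { unsigned long size, align; int nobits; };

int main(void)
{
	/* Hypothetical purgatory-style sections: .text, .rodata, .data, .bss */
	struct sec secs[] = {
		{ 0x1234, 16, 0 },
		{ 0x0200,  8, 0 },
		{ 0x0100, 32, 0 },
		{ 0x0800, 64, 1 },	/* SHT_NOBITS (bss) */
	};
	unsigned long buf_sz = 0, bss_sz = 0, buf_align = 1, bss_align = 1;
	unsigned long bss_pad = 0;
	unsigned int i;

	for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
		unsigned long a = secs[i].align;

		if (!secs[i].nobits) {
			if (buf_align < a)
				buf_align = a;
			buf_sz = ALIGN_UP(buf_sz, a) + secs[i].size;
		} else {
			if (bss_align < a)
				bss_align = a;
			bss_sz = ALIGN_UP(bss_sz, a) + secs[i].size;
		}
	}

	/* Pad the end of the progbits area so bss starts aligned. */
	if (buf_sz & (bss_align - 1))
		bss_pad = bss_align - (buf_sz & (bss_align - 1));

	printf("buf_sz=%#lx bss_pad=%#lx bss_sz=%#lx memsz=%#lx (buf_align=%lu)\n",
	       buf_sz, bss_pad, bss_sz, buf_sz + bss_pad + bss_sz, buf_align);
	return 0;
}

The resulting memsz is what gets handed to kexec_add_buffer(), while only buf_sz bytes of actual payload are copied; the bss region is left to the zero-fill behaviour of the destination segment.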
+ */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective architecture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the 
symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h new file mode 100644 index 000000000000..e4392a698ad4 --- /dev/null +++ b/kernel/kexec_internal.h @@ -0,0 +1,22 @@ +#ifndef LINUX_KEXEC_INTERNAL_H +#define LINUX_KEXEC_INTERNAL_H + +#include <linux/kexec.h> + +struct kimage *do_kimage_alloc_init(void); +int sanity_check_segment_list(struct kimage *image); +void kimage_free_page_list(struct list_head *list); +void kimage_free(struct kimage *image); +int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +void kimage_terminate(struct kimage *image); +int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); + +extern struct mutex kexec_mutex; + +#ifdef CONFIG_KEXEC_FILE +void kimage_file_post_load_cleanup(struct kimage *image); +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ +#endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 2777f40a9c7b..0277d1216f80 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ extern int max_threads; -static struct workqueue_struct *khelper_wq; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -114,10 +112,11 @@ out: * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns - * zero on success or a negative errno code on failure. Note that a - * successful module load does not mean the module did not then unload - * and exit on an error of its own. Callers must check that the service - * they requested is now available not blindly invoke it. + * zero on success or a negative errno code or positive exit code from + * "modprobe" on failure. Note that a successful module load does not mean + * the module did not then unload and exit on an error of its own. 
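The purgatory symbol helpers above (kexec_purgatory_find_symbol() and kexec_purgatory_get_set_symbol()) are plain ELF walking: locate an SHT_SYMTAB section, resolve names through the linked string table, and match STB_GLOBAL entries. The same walk can be reproduced in a small userspace program; the sketch below is 64-bit only, does minimal validation, and the file and symbol names are whatever the user supplies.

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Find a global symbol by name in a relocatable (ET_REL) 64-bit object. */
static Elf64_Sym *find_global_sym(void *map, const char *name)
{
	Elf64_Ehdr *ehdr = map;
	Elf64_Shdr *shdr = (Elf64_Shdr *)((char *)map + ehdr->e_shoff);
	int i;

	for (i = 0; i < ehdr->e_shnum; i++) {
		Elf64_Sym *syms;
		const char *strtab;
		size_t k, nsyms;

		if (shdr[i].sh_type != SHT_SYMTAB)
			continue;

		syms = (Elf64_Sym *)((char *)map + shdr[i].sh_offset);
		strtab = (char *)map + shdr[shdr[i].sh_link].sh_offset;
		nsyms = shdr[i].sh_size / sizeof(Elf64_Sym);

		for (k = 0; k < nsyms; k++) {
			if (ELF64_ST_BIND(syms[k].st_info) != STB_GLOBAL)
				continue;
			if (strcmp(strtab + syms[k].st_name, name) == 0)
				return &syms[k];
		}
	}
	return NULL;
}

int main(int argc, char **argv)
{
	struct stat st;
	Elf64_Ehdr *ehdr;
	Elf64_Sym *sym;
	void *map;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <object.o> <symbol>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	ehdr = map;
	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
	    ehdr->e_ident[EI_CLASS] != ELFCLASS64 || ehdr->e_type != ET_REL) {
		fprintf(stderr, "%s: not a 64-bit relocatable object\n", argv[1]);
		return 1;
	}
	sym = find_global_sym(map, argv[2]);
	if (sym)
		printf("%s: section %u, value 0x%llx, size %llu\n", argv[2],
		       (unsigned)sym->st_shndx,
		       (unsigned long long)sym->st_value,
		       (unsigned long long)sym->st_size);
	else
		printf("%s not found\n", argv[2]);
	return 0;
}

Pointed at any .o file, it prints the section index, value and size of the requested global, which is exactly the information the get/set helpers above use to locate a symbol's bytes inside the copied purgatory sections.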
Callers + * must check that the service they requested is now available not blindly + * invoke it. * * If module auto-loading support is disabled then this function * becomes a no-operation. @@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info) /* * This is the task which runs the usermode application */ -static int ____call_usermodehelper(void *data) +static int call_usermodehelper_exec_async(void *data) { struct subprocess_info *sub_info = data; struct cred *new; @@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock); - /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed_ptr(current, cpu_all_mask); - /* - * Our parent is keventd, which runs with elevated scheduling priority. - * Avoid propagating that into the userspace child. + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. */ set_user_nice(current, 0); @@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data) (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; - /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UHM_WAIT_PROC. + */ if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) @@ -266,15 +265,14 @@ out: do_exit(0); } -/* Keventd can't block, but this (a child) can. */ -static int wait_for_helper(void *data) +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { - struct subprocess_info *sub_info = data; pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; } else { @@ -282,44 +280,64 @@ static int wait_for_helper(void *data) /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. - * But wait_for_helper() always runs as keventd, and put_user() - * to a kernel address works OK for kernel threads, due to their - * having an mm_segment_t which spans the entire address space. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. * * Thus the __user pointer cast is valid here. */ sys_wait4(pid, (int __user *)&ret, 0, NULL); /* - * If ret is 0, either ____call_usermodehelper failed and the - * real error code is already in sub_info->retval or + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or * sub_info->retval is 0 anyway, so don't mess with it then. 
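In userspace terms, call_usermodehelper_exec_sync() above is the familiar fork/exec/wait pattern: spawn the helper, then collect its exit status synchronously, which is what UMH_WAIT_PROC asks for. A minimal sketch of that pattern follows; the /bin/true path is only an example.

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status = 0;

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {
		/* Child: the "helper", analogous to the execve done by the async path */
		char *argv[] = { "/bin/true", NULL };
		char *envp[] = { NULL };

		execve(argv[0], argv, envp);
		_exit(127);		/* only reached if execve failed */
	}

	/* Parent: UMH_WAIT_PROC-style synchronous wait for the helper */
	if (waitpid(pid, &status, 0) < 0) {
		perror("waitpid");
		return 1;
	}
	if (WIFEXITED(status))
		printf("helper exited with %d\n", WEXITSTATUS(status));
	return 0;
}

The kernel version has the extra twist that sys_wait4() wants a user pointer for the status, which is why the comment above spells out that a kernel thread's mm_segment_t makes the __user cast legitimate.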
*/ if (ret) sub_info->retval = ret; } + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + umh_complete(sub_info); - do_exit(0); } -/* This is run by khelper thread */ -static void __call_usermodehelper(struct work_struct *work) +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - pid_t pid; - if (sub_info->wait & UMH_WAIT_PROC) - pid = kernel_thread(wait_for_helper, sub_info, - CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, - SIGCHLD); - - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } } } @@ -509,7 +527,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, if (!sub_info) goto out; - INIT_WORK(&sub_info->work, __call_usermodehelper); + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; @@ -531,8 +549,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * from interrupt context. * * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of keventd. - * (ie. it runs with full root capabilities). + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { @@ -544,7 +562,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) return -EINVAL; } helper_lock(); - if (!khelper_wq || usermodehelper_disabled) { + if (usermodehelper_disabled) { retval = -EBUSY; goto out; } @@ -556,7 +574,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) sub_info->complete = (wait == UMH_NO_WAIT) ? 
NULL : &done; sub_info->wait = wait; - queue_work(khelper_wq, &sub_info->work); + queue_work(system_unbound_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; @@ -686,9 +704,3 @@ struct ctl_table usermodehelper_table[] = { }, { } }; - -void __init usermodehelper_init(void) -{ - khelper_wq = create_singlethread_workqueue("khelper"); - BUG_ON(!khelper_wq); -} diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417bb963..d10ab6b9b5e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6683ccef9fff..e83b26464061 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_KEXEC_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 10e489c448fe..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -97,6 +97,7 @@ bool kthread_should_park(void) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); } +EXPORT_SYMBOL_GPL(kthread_should_park); /** * kthread_freezable_should_stop - should this freezable kthread return now? @@ -171,6 +172,7 @@ void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } +EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { @@ -246,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. - * @node: memory node number. + * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. 
@threadfn() can either call do_exit() directly if it is a * standalone thread for which no one will call kthread_stop(), or @@ -325,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) { - /* Must have done schedule() in kthread() before we set_task_cpu */ + unsigned long flags; + if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + /* It's safe because the task is inactive. */ - do_set_cpus_allowed(p, cpumask_of(cpu)); + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) +{ + __kthread_bind_mask(p, cpumask_of(cpu), state); +} + +void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) +{ + __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** @@ -411,6 +428,7 @@ void kthread_unpark(struct task_struct *k) if (kthread) __kthread_unpark(k, kthread); } +EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). @@ -441,6 +459,7 @@ int kthread_park(struct task_struct *k) } return ret; } +EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c40ebcca0495..6e5344112419 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -348,8 +348,10 @@ static void klp_disable_func(struct klp_func *func) { struct klp_ops *ops; - WARN_ON(func->state != KLP_ENABLED); - WARN_ON(!func->old_addr); + if (WARN_ON(func->state != KLP_ENABLED)) + return; + if (WARN_ON(!func->old_addr)) + return; ops = klp_find_ops(func->old_addr); if (WARN_ON(!ops)) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..8e96f6cc2a4a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8acfbf773e06..4e49cc4c9952 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock); static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, struct lockdep_map *nest_lock, unsigned long ip, - int references) + int references, int pin_count) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 
hlock->waittime_stamp = 0; hlock->holdtime_stamp = lockstat_clock(); #endif - hlock->pin_count = 0; + hlock->pin_count = pin_count; if (check && !mark_irqflags(curr, hlock)) return 0; @@ -3343,7 +3343,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3433,7 +3433,7 @@ found_it: hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references)) + hlock->references, hlock->pin_count)) return 0; } @@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0); + irqs_disabled_flags(flags), nest_lock, ip, 0, 0); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 32244186f1f2..8ef1919d63b2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -17,12 +17,14 @@ * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kthread.h> +#include <linux/sched/rt.h> #include <linux/spinlock.h> #include <linux/rwlock.h> #include <linux/mutex.h> @@ -34,6 +36,7 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/percpu-rwsem.h> #include <linux/torture.h> MODULE_LICENSE("GPL"); @@ -91,11 +94,13 @@ struct lock_torture_ops { void (*init)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); + void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(void); - unsigned long flags; + + unsigned long flags; /* for irq spinlocks */ const char *name; }; @@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void) /* BUGGY, do not use in real life!!! 
*/ } +static void torture_boost_dummy(struct torture_random_state *trsp) +{ + /* Only rtmutexes care about priority */ +} + static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -211,6 +223,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -315,6 +329,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#ifdef CONFIG_RT_MUTEXES +static DEFINE_RT_MUTEX(torture_rtmutex); + +static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; +} + +static void torture_rtmutex_boost(struct torture_random_state *trsp) +{ + int policy; + struct sched_param param; + const unsigned int factor = 50000; /* yes, quite arbitrary */ + + if (!rt_task(current)) { + /* + * (1) Boost priority once every ~50k operations. When the + * task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + policy = SCHED_FIFO; + param.sched_priority = MAX_RT_PRIO - 1; + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another ~500k operations, + * then restored back to its original prio, and so forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. 
+ */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + policy = SCHED_NORMAL; + param.sched_priority = 0; + } else /* common case, do nothing */ + return; + } + + sched_setscheduler_nocheck(current, policy, ¶m); +} + +static void torture_rtmutex_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* + * We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +{ + rt_mutex_unlock(&torture_rtmutex); +} + +static struct lock_torture_ops rtmutex_lock_ops = { + .writelock = torture_rtmutex_lock, + .write_delay = torture_rtmutex_delay, + .task_boost = torture_rtmutex_boost, + .writeunlock = torture_rtmutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "rtmutex_lock" +}; +#endif + static DECLARE_RWSEM(torture_rwsem); static int torture_rwsem_down_write(void) __acquires(torture_rwsem) { @@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = { .name = "rwsem_lock" }; +#include <linux/percpu-rwsem.h> +static struct percpu_rw_semaphore pcpu_rwsem; + +void torture_percpu_rwsem_init(void) +{ + BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); +} + +static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +{ + percpu_down_write(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +{ + percpu_up_write(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +{ + percpu_down_read(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +{ + percpu_up_read(&pcpu_rwsem); +} + +static struct lock_torture_ops percpu_rwsem_lock_ops = { + .init = torture_percpu_rwsem_init, + .writelock = torture_percpu_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_percpu_rwsem_up_write, + .readlock = torture_percpu_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_percpu_rwsem_up_read, + .name = "percpu_rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. 
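torture_rtmutex_boost() above occasionally flips the writer between an RT priority and SCHED_NORMAL so that the rtmutex priority-inheritance paths get exercised. A rough userspace analogue of that boost/deboost pattern, using sched_setscheduler(); the iteration counts and the 50000 factor are as arbitrary as in the test itself, and switching to SCHED_FIFO needs CAP_SYS_NICE.

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static void set_policy(int policy, int prio)
{
	struct sched_param param;

	memset(&param, 0, sizeof(param));
	param.sched_priority = prio;
	if (sched_setscheduler(0, policy, &param))
		perror("sched_setscheduler");
}

int main(void)
{
	const int factor = 50000;	/* arbitrary, as in the torture test */
	int boosted = 0;
	long i;

	srand(time(NULL));
	for (i = 0; i < 1000000; i++) {
		if (!boosted && !(rand() % factor)) {
			set_policy(SCHED_FIFO, sched_get_priority_max(SCHED_FIFO));
			boosted = 1;
		} else if (boosted && !(rand() % (factor * 2))) {
			set_policy(SCHED_OTHER, 0);
			boosted = 0;
		}
		/* ... acquire and release the lock under test here ... */
	}
	if (boosted)
		set_policy(SCHED_OTHER, 0);	/* reset, like task_boost(NULL) */
	return 0;
}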
@@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->task_boost(&rand); cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; @@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg) stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); + + cxt.cur_ops->task_boost(NULL); /* reset prio */ torture_kthread_stopping("lock_torture_writer"); return 0; } @@ -642,7 +788,11 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, +#ifdef CONFIG_RT_MUTEXES + &rtmutex_lock_ops, +#endif &rwsem_lock_ops, + &percpu_rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -661,11 +811,11 @@ static int __init lock_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cxt.cur_ops->init) - cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cxt.cur_ops->init(); if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; @@ -676,6 +826,10 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "mutex", 5) == 0) cxt.debug_lock = true; #endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (strncmp(torture_type, "rtmutex", 7) == 0) + cxt.debug_lock = true; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if ((strncmp(torture_type, "spin", 4) == 0) || (strncmp(torture_type, "rw_lock", 7) == 0)) diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index fd91aaa4554c..5b9102a47ea5 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg(lock, node); + prev = xchg_acquire(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads @@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) /* * Release the lock by setting it to NULL */ - if (likely(cmpxchg(lock, node, NULL) == node)) + if (likely(cmpxchg_release(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4cccea6b8934..0551c219c40e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) static inline bool mutex_try_to_acquire(struct mutex *lock) { return !mutex_is_locked(lock) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1); + (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); } /* @@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * Once more, try to acquire the lock. Only try-lock the mutex if * it is unlocked to reduce unnecessary xchg() operations. 
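The conversions from fully ordered xchg()/cmpxchg() to the _acquire/_release variants in mcs_spinlock.h and mutex.c rely on the standard pairing: a release store makes everything before it visible to whichever CPU later observes it with an acquire load. A small C11 illustration of that pairing; the flag and payload names are invented for the example.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;			/* data handed over */
static _Atomic int handoff;		/* hand-off flag */

static void *producer(void *arg)
{
	payload = 1234;					/* plain store ...          */
	atomic_store_explicit(&handoff, 1,
			      memory_order_release);	/* ... published by RELEASE */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);

	/*
	 * ACQUIRE pairs with the producer's RELEASE: once the flag is seen,
	 * the payload written before it is guaranteed to be visible.
	 */
	while (!atomic_load_explicit(&handoff, memory_order_acquire))
		;
	printf("payload=%d\n", payload);

	pthread_join(t, NULL);
	return 0;
}

The lock-side operations only need acquire semantics (nothing from the critical section may leak before the acquisition) and the unlock side only needs release semantics, which is why the weaker primitives are sufficient here on architectures where they are cheaper.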
*/ - if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) + if (!mutex_is_locked(lock) && + (atomic_xchg_acquire(&lock->count, 0) == 1)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * non-negative in order to avoid unnecessary xchg operations: */ if (atomic_read(&lock->count) >= 0 && - (atomic_xchg(&lock->count, -1) == 1)) + (atomic_xchg_acquire(&lock->count, -1) == 1)) break; /* @@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) spin_lock_mutex(&lock->wait_lock, flags); - prev = atomic_xchg(&lock->count, -1); + prev = atomic_xchg_acquire(&lock->count, -1); if (likely(prev == 1)) { mutex_set_owner(lock); mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index dc85ee23a26f..d092a0c9c2d4 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its @@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) node->next = NULL; node->cpu = curr; - old = atomic_xchg(&lock->tail, curr); + /* + * ACQUIRE semantics, pairs with corresponding RELEASE + * in unlock() uncontended, or fastpath. + */ + old = atomic_xchg_acquire(&lock->tail, curr); if (old == OSQ_UNLOCKED_VAL) return true; @@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Fast path for the uncontended case. */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + if (likely(atomic_cmpxchg_release(&lock->tail, curr, + OSQ_UNLOCKED_VAL) == curr)) return; /* diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f231e0bb311c 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); + rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); return 0; } +EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { + /* + * XXX: temporary kludge. The error path in alloc_super() + * assumes that percpu_free_rwsem() is safe after kzalloc(). + */ + if (!brw->fast_read_ctr) + return; + + rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } /* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer + * This is the fast-path for down_read/up_read. 
If it succeeds we rely + * on the barriers provided by rcu_sync_enter/exit; see the comments in + * percpu_down_write() and percpu_up_write(). * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { - bool success = false; + bool success; preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { + success = rcu_sync_is_idle(&brw->rss); + if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } preempt_enable(); return success; @@ -77,16 +70,30 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + + if (likely(update_fast_ctr(brw, +1))) return; - } - down_read(&brw->rw_sem); + /* Avoid rwsem_acquire_read() and rwsem_release() */ + __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } +EXPORT_SYMBOL_GPL(percpu_down_read); + +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} void percpu_up_read(struct percpu_rw_semaphore *brw) { @@ -99,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); } +EXPORT_SYMBOL_GPL(percpu_up_read); static int clear_fast_ctr(struct percpu_rw_semaphore *brw) { @@ -113,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) return sum; } -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ void percpu_down_write(struct percpu_rw_semaphore *brw) { - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). + * Make rcu_sync_is_idle() == F and thus disable the fast-path in + * percpu_down_read() and percpu_up_read(), and wait for gp pass. * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). 
+ * The latter synchronises us with the preceding readers which used + * the fast-past, so we can not miss the result of __this_cpu_add() + * or anything else inside their criticial sections. */ - synchronize_sched_expedited(); + rcu_sync_enter(&brw->rss); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); @@ -150,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); } +EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) { /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). + * Enable the fast-path in percpu_down_read() and percpu_up_read() + * but only after another gp pass; this adds the necessary barrier + * to ensure the reader can't miss the changes done by us. */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); + rcu_sync_exit(&brw->rss); } +EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6c5da483966b..fec082338668 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { cpu_relax_lowlatency(); - cnts = smp_load_acquire((u32 *)&lock->cnts); + cnts = atomic_read_acquire(&lock->cnts); } } /** - * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure + * @cnts: Current qrwlock lock value */ -void queue_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) { - u32 cnts; - /* * Readers come here when they cannot get the lock without waiting */ if (unlikely(in_interrupt())) { /* - * Readers in interrupt context will spin until the lock is - * available without waiting in the queue. + * Readers in interrupt context will get the lock immediately + * if the writer is just waiting (not holding the lock yet). + * The rspin_until_writer_unlock() function returns immediately + * in this case. Otherwise, they will spin (with ACQUIRE + * semantics) until the lock is available without waiting in + * the queue. */ - cnts = smp_load_acquire((u32 *)&lock->cnts); rspin_until_writer_unlock(lock, cnts); return; } @@ -84,42 +86,37 @@ void queue_read_lock_slowpath(struct qrwlock *lock) /* * Put the reader into the wait queue */ - arch_spin_lock(&lock->lock); + arch_spin_lock(&lock->wait_lock); /* - * At the head of the wait queue now, wait until the writer state - * goes to 0 and then try to increment the reader count and get - * the lock. It is possible that an incoming writer may steal the - * lock in the interim, so it is necessary to check the writer byte - * to make sure that the write lock isn't taken. + * The ACQUIRE semantics of the following spinning code ensure + * that accesses can't leak upwards out of our subsequent critical + * section in the case that the lock is currently held for write. 
*/ - while (atomic_read(&lock->cnts) & _QW_WMASK) - cpu_relax_lowlatency(); - - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); /* * Signal the next one in queue to become queue head */ - arch_spin_unlock(&lock->lock); + arch_spin_unlock(&lock->wait_lock); } -EXPORT_SYMBOL(queue_read_lock_slowpath); +EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * queued_write_lock_slowpath - acquire write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ -void queue_write_lock_slowpath(struct qrwlock *lock) +void queued_write_lock_slowpath(struct qrwlock *lock) { u32 cnts; /* Put the writer into the wait queue */ - arch_spin_lock(&lock->lock); + arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; /* @@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) struct __qrwlock *l = (struct __qrwlock *)lock; if (!READ_ONCE(l->wmode) && - (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) + (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); @@ -140,13 +137,13 @@ void queue_write_lock_slowpath(struct qrwlock *lock) for (;;) { cnts = atomic_read(&lock->cnts); if ((cnts == _QW_WAITING) && - (atomic_cmpxchg(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) + (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) break; cpu_relax_lowlatency(); } unlock: - arch_spin_unlock(&lock->lock); + arch_spin_unlock(&lock->wait_lock); } -EXPORT_SYMBOL(queue_write_lock_slowpath); +EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c49202d532..87e9ce6a63c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } - +static __always_inline void __pv_kick_node(struct qspinlock *lock, + struct mcs_spinlock *node) { } static __always_inline void __pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { } @@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (pv_enabled()) goto queue; - if (virt_queued_spin_lock(lock)) + if (virt_spin_lock(lock)) return; /* @@ -440,7 +440,7 @@ queue: cpu_relax(); arch_mcs_spin_unlock_contended(&next->locked); - pv_kick_node(next); + pv_kick_node(lock, next); release: /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 04ab18151cc8..f0450ff4829b 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -4,6 +4,7 @@ #include <linux/hash.h> #include <linux/bootmem.h> +#include <linux/debug_locks.h> /* * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead @@ -21,9 +22,14 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. 
+ */ enum vcpu_state { vcpu_running = 0, - vcpu_halted, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ }; struct pv_node { @@ -152,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node) /* * Wait for node->locked to become true, halt the vcpu after a short spin. - * pv_kick_node() is used to wake the vcpu again. + * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. */ static void pv_wait_node(struct mcs_spinlock *node) { @@ -171,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node) * * [S] pn->state = vcpu_halted [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_running + * [L] pn->locked [RmW] pn->state = vcpu_hashed * - * Matches the xchg() from pv_kick_node(). + * Matches the cmpxchg() from pv_kick_node(). */ smp_store_mb(pn->state, vcpu_halted); @@ -181,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node) pv_wait(&pn->state, vcpu_halted); /* - * Reset the vCPU state to avoid unncessary CPU kicking + * If pv_kick_node() changed us to vcpu_hashed, retain that value + * so that pv_wait_head() knows to not also try to hash this lock. */ - WRITE_ONCE(pn->state, vcpu_running); + cmpxchg(&pn->state, vcpu_halted, vcpu_running); /* * If the locked flag is still not set after wakeup, it is a @@ -193,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * MCS lock will be released soon. */ } + /* * By now our node->locked should be 1 and our caller will not actually * spin-wait for it. We do however rely on our caller to do a @@ -201,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node) } /* - * Called after setting next->locked = 1, used to wake those stuck in - * pv_wait_node(). + * Called after setting next->locked = 1 when we're the lock owner. + * + * Instead of waking the waiters stuck in pv_wait_node() advance their state such + * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. */ -static void pv_kick_node(struct mcs_spinlock *node) +static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; /* - * Note that because node->locked is already set, this actual - * mcs_spinlock entry could be re-used already. + * If the vCPU is indeed halted, advance its state to match that of + * pv_wait_node(). If OTOH this fails, the vCPU was running and will + * observe its next->locked value and advance itself. * - * This should be fine however, kicking people for no reason is - * harmless. + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + */ + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table and set the _Q_SLOW_VAL. * - * See the comment in pv_wait_node(). + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. */ - if (xchg(&pn->state, vcpu_running) == vcpu_halted) - pv_kick(pn->cpu); + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); } /* @@ -232,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct qspinlock **lp = NULL; int loop; + /* + * If pv_kick_node() already advanced our state, we don't need to + * insert ourselves into the hash table anymore. 
+ */ + if (READ_ONCE(pn->state) == vcpu_hashed) + lp = (struct qspinlock **)1; + for (;;) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) @@ -239,17 +266,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) cpu_relax(); } - WRITE_ONCE(pn->state, vcpu_halted); if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); + /* - * lp must be set before setting _Q_SLOW_VAL + * We must hash before setting _Q_SLOW_VAL, such that + * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() + * we'll be sure to be able to observe our hash entry. * - * [S] lp = lock [RmW] l = l->locked = 0 - * MB MB - * [S] l->locked = _Q_SLOW_VAL [L] lp + * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL + * MB RMB + * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> * - * Matches the cmpxchg() in __pv_queued_spin_unlock(). + * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { /* @@ -286,14 +315,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; struct pv_node *node; + u8 locked; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL)) + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + if (unlikely(locked != _Q_SLOW_VAL)) { + WARN(!debug_locks_silent, + "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", + (unsigned long)lock, atomic_read(&lock->val)); return; + } + + /* + * A failed cmpxchg doesn't provide any memory-ordering guarantees, + * so we need a barrier to order the read of the node data in + * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. + * + * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + */ + smp_rmb(); /* * Since the above failed to release, this must be the SLOW path. @@ -310,9 +357,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) /* * At this point the memory pointed at by lock can be freed/reused, * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. 
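The smp_rmb() added to __pv_queued_spin_unlock() exists because a failed cmpxchg() provides no ordering, yet the unlocker must read the hashed node only after having observed _Q_SLOW_VAL. In C11 terms that is a relaxed failure ordering followed by an explicit acquire fence. A toy model of the same pattern; the state values, node_data and the usleep() are purely for demonstration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define LOCKED 1
#define SLOW   3

static _Atomic unsigned char locked = LOCKED;
static int node_data;			/* stands in for the hashed pv_node */

static void *waiter(void *arg)
{
	node_data = 42;					/* publish the node ...  */
	atomic_store_explicit(&locked, SLOW,
			      memory_order_release);	/* ... then mark SLOW    */
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned char expected = LOCKED;

	pthread_create(&t, NULL, waiter, NULL);
	usleep(1000);			/* give the waiter a chance to run (demo only) */

	if (atomic_compare_exchange_strong_explicit(&locked, &expected, 0,
						    memory_order_acq_rel,
						    memory_order_relaxed)) {
		printf("fast path: lock released\n");
	} else if (expected == SLOW) {
		/*
		 * The failed CAS used a relaxed failure ordering, so an
		 * explicit acquire fence is needed before reading the
		 * published node data (the kernel analogue is the smp_rmb()
		 * before pv_unhash()).
		 */
		atomic_thread_fence(memory_order_acquire);
		printf("slow path: node_data=%d\n", node_data);
	}

	pthread_join(t, NULL);
	return 0;
}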
*/ - if (READ_ONCE(node->state) == vcpu_halted) - pv_kick(node->cpu); + pv_kick(node->cpu); } /* * Include the architecture specific callee-save thunk of the diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/locking/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * - */ -#include <linux/device.h> -#include <linux/kthread.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/sched/rt.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/freezer.h> -#include <linux/stat.h> - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? 
-EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! 
*/ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5674b073473c..8251e75dd9c0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. 
*/ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) +# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) + +/* + * Callers must hold the ->wait_lock -- which is the whole purpose as we force + * all future threads that attempt to [Rmw] the lock to the slowpath. As such + * relaxed semantics suffice. + */ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; do { owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); + } while (cmpxchg_relaxed(p, owner, + owner | RT_MUTEX_HAS_WAITERS) != owner); } /* @@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) * lock(wait_lock); * acquire(lock); */ - return rt_mutex_cmpxchg(lock, owner, NULL); + return rt_mutex_cmpxchg_release(lock, owner, NULL); } #else -# define rt_mutex_cmpxchg(l,c,n) (0) +# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) +# define rt_mutex_cmpxchg_acquire(l,c,n) (0) +# define rt_mutex_cmpxchg_release(l,c,n) (0) + static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) @@ -158,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return (left->task->dl.deadline < right->task->dl.deadline); + return dl_time_before(left->task->dl.deadline, + right->task->dl.deadline); return 0; } @@ -1120,7 +1133,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_print_deadlock(waiter); - schedule_rt_mutex(lock); + schedule(); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -1321,7 +1334,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk)) { - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else @@ -1337,7 +1350,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg(lock, NULL, current))) { + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 0; } else @@ -1348,7 +1361,7 @@ static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { rt_mutex_deadlock_account_lock(lock, current); return 1; } @@ -1362,7 +1375,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, { WAKE_Q(wake_q); - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); } else { @@ -1484,7 +1497,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wqh) { - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); return false; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7844f8f0e639..4f5f83c7d2d3 100644 --- a/kernel/locking/rtmutex_common.h +++ 
b/kernel/locking/rtmutex_common.h @@ -15,28 +15,6 @@ #include <linux/rtmutex.h> /* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0f189714e457..a4d4de05b2d1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) * to reduce unnecessary expensive cmpxchg() operations. */ if (count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { if (!list_is_singular(&sem->wait_list)) rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); @@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + old = cmpxchg_acquire(&sem->count, count, + count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); return true; diff --git a/kernel/membarrier.c b/kernel/membarrier.c new file mode 100644 index 000000000000..536c727a56e9 --- /dev/null +++ b/kernel/membarrier.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/syscalls.h> +#include <linux/membarrier.h> + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, or if the command argument is invalid, + * this system call returns -EINVAL. For a given command, with flags argument + * set to 0, this system call is guaranteed to always return the same value + * until reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). 
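The new system call documented here is easiest to see from the caller's side. The following userspace sketch is editor-written and not part of the patch; the membarrier() wrapper is just a local helper name, and the command constants come from the uapi <linux/membarrier.h>:

        #include <stdio.h>
        #include <unistd.h>
        #include <sys/syscall.h>
        #include <linux/membarrier.h>

        static int membarrier(int cmd, int flags)
        {
                return syscall(__NR_membarrier, cmd, flags);
        }

        int main(void)
        {
                int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

                if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED)) {
                        fprintf(stderr, "membarrier not supported\n");
                        return 1;
                }
                /* Orders this thread's prior accesses against every running
                 * thread in the system, via synchronize_sched() in the kernel. */
                return membarrier(MEMBARRIER_CMD_SHARED, 0);
        }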
If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + return MEMBARRIER_CMD_BITMASK; + case MEMBARRIER_CMD_SHARED: + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/memremap.c b/kernel/memremap.c new file mode 100644 index 000000000000..9d6b55587eaa --- /dev/null +++ b/kernel/memremap.c @@ -0,0 +1,200 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/memory_hotplug.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size) +{ + struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + + /* In the simple case just return the existing linear address */ + if (!PageHighMem(page)) + return __va(offset); + return NULL; /* fallback to ioremap_cache */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: either MEMREMAP_WB or MEMREMAP_WT + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. + * + * MEMREMAP_WB - matches the default mapping for "System RAM" on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map "System RAM" with this mapping type will fail. 
+ */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, "System RAM"); + void *addr = NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + flags &= ~MEMREMAP_WB; + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in "System RAM" + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size); + if (!addr) + addr = ioremap_cache(offset, size); + } + + /* + * If we don't have a mapping yet and more request flags are + * pending then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * "System RAM" + */ + if (!addr && is_ram == REGION_INTERSECTS && flags) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) { + flags &= ~MEMREMAP_WT; + addr = ioremap_wt(offset, size); + } + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, + addr)); + memunmap(addr); +} +EXPORT_SYMBOL(devm_memunmap); + +#ifdef CONFIG_ZONE_DEVICE +struct page_map { + struct resource res; +}; + +static void devm_memremap_pages_release(struct device *dev, void *res) +{ + struct page_map *page_map = res; + + /* pages are dead and unused, undo the arch mapping */ + arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); +} + +void *devm_memremap_pages(struct device *dev, struct resource *res) +{ + int is_ram = region_intersects(res->start, resource_size(res), + "System RAM"); + struct page_map *page_map; + int error, nid; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, res); + return ERR_PTR(-ENXIO); + } + + if (is_ram == REGION_INTERSECTS) + return __va(res->start); + + page_map = devres_alloc(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL); + if (!page_map) + return ERR_PTR(-ENOMEM); + + memcpy(&page_map->res, res, sizeof(*res)); + + nid = dev_to_node(dev); + if (nid < 0) + nid = 0; + + error = arch_add_memory(nid, res->start, resource_size(res), true); + if (error) { + devres_free(page_map); + return ERR_PTR(error); + } + + devres_add(dev, page_map); + return __va(res->start); +} +EXPORT_SYMBOL(devm_memremap_pages); +#endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c 
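Before moving on to the module changes below, a consumer-side sketch of the new mapping interface may help (editor-written; example_map()/example_unmap() and the flag combination are illustrative, not from the patch). Passing both flags lets memremap() fall back from a cacheable mapping to write-through, per the flag-ordering logic above:

        #include <linux/io.h>

        static void *example_map(resource_size_t start, size_t len)
        {
                /* Only for ranges with no read/write side effects. */
                void *va = memremap(start, len, MEMREMAP_WB | MEMREMAP_WT);

                if (!va)
                        return NULL;
                /* Unlike ioremap(), va is a plain kernel pointer (no __iomem),
                 * so memcpy() and ordinary loads/stores are fine. */
                return va;
        }

        static void example_unmap(void *va)
        {
                memunmap(va);
        }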
index 4d2b82e610e2..8f051a106676 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -602,13 +602,16 @@ const struct kernel_symbol *find_symbol(const char *name, } EXPORT_SYMBOL_GPL(find_symbol); -/* Search for module by name: must hold module_mutex. */ +/* + * Search for module by name: must hold module_mutex (or preempt disabled + * for read-only access). + */ static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; - module_assert_mutex(); + module_assert_mutex_or_preempt(); list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) @@ -621,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len, struct module *find_module(const char *name) { + module_assert_mutex(); return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -1059,11 +1063,15 @@ void symbol_put_addr(void *addr) if (core_kernel_text(a)) return; - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. + */ + preempt_disable(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); + preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index be5b8fac4bd0..6528a79d998d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -10,11 +10,9 @@ */ #include <linux/kernel.h> -#include <linux/err.h> -#include <crypto/public_key.h> -#include <crypto/hash.h> -#include <keys/asymmetric-type.h> +#include <linux/errno.h> #include <keys/system_keyring.h> +#include <crypto/public_key.h> #include "module-internal.h" /* @@ -28,170 +26,22 @@ * - Information block */ struct module_signature { - u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum hash_algo] */ - u8 id_type; /* Key identifier type [enum pkey_id_type] */ - u8 signer_len; /* Length of signer's name */ - u8 key_id_len; /* Length of key identifier */ + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ u8 __pad[3]; __be32 sig_len; /* Length of signature data */ }; /* - * Digest the module contents. - */ -static struct public_key_signature *mod_make_digest(enum hash_algo hash, - const void *mod, - unsigned long modlen) -{ - struct public_key_signature *pks; - struct crypto_shash *tfm; - struct shash_desc *desc; - size_t digest_size, desc_size; - int ret; - - pr_devel("==>%s()\n", __func__); - - /* Allocate the hashing algorithm we're going to need and find out how - * big the hash operational data will be. - */ - tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); - if (IS_ERR(tfm)) - return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - digest_size = crypto_shash_digestsize(tfm); - - /* We allocate the hash operational data storage on the end of our - * context data and the digest output buffer on the end of that. 
- */ - ret = -ENOMEM; - pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); - if (!pks) - goto error_no_pks; - - pks->pkey_hash_algo = hash; - pks->digest = (u8 *)pks + sizeof(*pks) + desc_size; - pks->digest_size = digest_size; - - desc = (void *)pks + sizeof(*pks); - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto error; - - ret = crypto_shash_finup(desc, mod, modlen, pks->digest); - if (ret < 0) - goto error; - - crypto_free_shash(tfm); - pr_devel("<==%s() = ok\n", __func__); - return pks; - -error: - kfree(pks); -error_no_pks: - crypto_free_shash(tfm); - pr_devel("<==%s() = %d\n", __func__, ret); - return ERR_PTR(ret); -} - -/* - * Extract an MPI array from the signature data. This represents the actual - * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the - * size of the MPI in bytes. - * - * RSA signatures only have one MPI, so currently we only read one. - */ -static int mod_extract_mpi_array(struct public_key_signature *pks, - const void *data, size_t len) -{ - size_t nbytes; - MPI mpi; - - if (len < 3) - return -EBADMSG; - nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; - data += 2; - len -= 2; - if (len != nbytes) - return -EBADMSG; - - mpi = mpi_read_raw_data(data, nbytes); - if (!mpi) - return -ENOMEM; - pks->mpi[0] = mpi; - pks->nr_mpi = 1; - return 0; -} - -/* - * Request an asymmetric key. - */ -static struct key *request_asymmetric_key(const char *signer, size_t signer_len, - const u8 *key_id, size_t key_id_len) -{ - key_ref_t key; - size_t i; - char *id, *q; - - pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); - - /* Construct an identifier. */ - id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); - if (!id) - return ERR_PTR(-ENOKEY); - - memcpy(id, signer, signer_len); - - q = id + signer_len; - *q++ = ':'; - *q++ = ' '; - for (i = 0; i < key_id_len; i++) { - *q++ = hex_asc[*key_id >> 4]; - *q++ = hex_asc[*key_id++ & 0x0f]; - } - - *q = 0; - - pr_debug("Look up: \"%s\"\n", id); - - key = keyring_search(make_key_ref(system_trusted_keyring, 1), - &key_type_asymmetric, id); - if (IS_ERR(key)) - pr_warn("Request for unknown module key '%s' err %ld\n", - id, PTR_ERR(key)); - kfree(id); - - if (IS_ERR(key)) { - switch (PTR_ERR(key)) { - /* Hide some search errors */ - case -EACCES: - case -ENOTDIR: - case -EAGAIN: - return ERR_PTR(-ENOKEY); - default: - return ERR_CAST(key); - } - } - - pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); - return key_ref_to_ptr(key); -} - -/* * Verify the signature on a module. 
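For orientation (editor's sketch, not text from the patch): after this rewrite the signed-module trailer that mod_verify_sig() expects is

        [ module payload ][ PKCS#7 blob ][ struct module_signature ]

with the signer, key identifier and digest carried inside the PKCS#7 blob, which is why every per-field length in module_signature must now be zero. The magic-string marker handling happens in the caller in kernel/module.c and is outside this hunk. A minimal parser for such a trailer, with hypothetical buf/len names, might look like:

        static int example_parse_trailer(const void *buf, unsigned long len)
        {
                struct module_signature ms;
                unsigned long sig_len;

                if (len <= sizeof(ms))
                        return -EBADMSG;
                memcpy(&ms, buf + len - sizeof(ms), sizeof(ms));
                len -= sizeof(ms);

                sig_len = be32_to_cpu(ms.sig_len);
                if (sig_len >= len || ms.id_type != PKEY_ID_PKCS7)
                        return -EBADMSG;
                len -= sig_len;         /* len is now the unsigned payload size */
                return 0;
        }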
*/ int mod_verify_sig(const void *mod, unsigned long *_modlen) { - struct public_key_signature *pks; struct module_signature ms; - struct key *key; - const void *sig; size_t modlen = *_modlen, sig_len; - int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -205,46 +55,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - if ((size_t)ms.signer_len + ms.key_id_len >= modlen) - return -EBADMSG; - modlen -= (size_t)ms.signer_len + ms.key_id_len; - *_modlen = modlen; - sig = mod + modlen; - - /* For the moment, only support RSA and X.509 identifiers */ - if (ms.algo != PKEY_ALGO_RSA || - ms.id_type != PKEY_ID_X509) - return -ENOPKG; - if (ms.hash >= PKEY_HASH__LAST || - !hash_algo_name[ms.hash]) + if (ms.id_type != PKEY_ID_PKCS7) { + pr_err("Module is not signed with expected PKCS#7 message\n"); return -ENOPKG; - - key = request_asymmetric_key(sig, ms.signer_len, - sig + ms.signer_len, ms.key_id_len); - if (IS_ERR(key)) - return PTR_ERR(key); - - pks = mod_make_digest(ms.hash, mod, modlen); - if (IS_ERR(pks)) { - ret = PTR_ERR(pks); - goto error_put_key; } - ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, - sig_len); - if (ret < 0) - goto error_free_pks; - - ret = verify_signature(key, pks); - pr_devel("verify_signature() = %d\n", ret); + if (ms.algo != 0 || + ms.hash != 0 || + ms.signer_len != 0 || + ms.key_id_len != 0 || + ms.__pad[0] != 0 || + ms.__pad[1] != 0 || + ms.__pad[2] != 0) { + pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + return -EBADMSG; + } -error_free_pks: - mpi_free(pks->rsa.s); - kfree(pks); -error_put_key: - key_put(key); - pr_devel("<==%s() = %d\n", __func__, ret); - return ret; + return system_verify_data(mod, modlen, mod + modlen, sig_len, + VERIFYING_MODULE_SIGNATURE); } diff --git a/kernel/notifier.c b/kernel/notifier.c index ae9fc7cc360e..fd2c9acbcc19 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str, .signr = sig, }; + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); diff --git a/kernel/pid.c b/kernel/pid.c index 4fd07d5b7baf..ca368793808e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held(), - "find_task_by_pid_ns() needs rcu_read_lock()" - " protection"); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9e302315e33d..02e8dfaa1ce2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,6 +18,16 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config SUSPEND_SKIP_SYNC + bool "Skip kernel's sys_sync() on suspend to RAM/standby" + depends on SUSPEND + depends on EXPERT + help + Skip the kernel sys_sync() before freezing user processes. + Some systems prefer not to pay this cost on every invocation + of suspend, or they are content with invoking sync() from + user-space before invoking suspend. Say Y if that's your case. 
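The help text above assumes user space takes over the sync. A minimal illustration of that responsibility (editor-written, not part of the patch) is a suspend helper that flushes dirty data itself before writing to /sys/power/state:

        #include <fcntl.h>
        #include <unistd.h>

        int main(void)
        {
                int fd;

                sync();                         /* the step the kernel may now skip */
                fd = open("/sys/power/state", O_WRONLY);
                if (fd < 0)
                        return 1;
                if (write(fd, "mem", 3) != 3) { /* enter suspend-to-RAM */
                        close(fd);
                        return 1;
                }
                close(fd);
                return 0;
        }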
+ config HIBERNATE_CALLBACKS bool diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 690f78f210f2..b7342a24f559 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -733,7 +733,7 @@ int hibernate(void) * contents of memory is restored from the saved image. * * If this is successful, control reappears in the restored target kernel in - * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ diff --git a/kernel/power/main.c b/kernel/power/main.c index 63d395b5df93..b2dd4d999900 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -272,6 +272,22 @@ static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } + +static ssize_t pm_wakeup_irq_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; +} + +static ssize_t pm_wakeup_irq_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} +power_attr(pm_wakeup_irq); + #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ @@ -604,6 +620,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, + &pm_wakeup_irq_attr.attr, #endif #endif #ifdef CONFIG_FREEZER diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 53266b729fd9..f9fe133c13e2 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -35,6 +35,9 @@ const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; +unsigned int pm_suspend_global_flags; +EXPORT_SYMBOL_GPL(pm_suspend_global_flags); + static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); @@ -484,13 +487,16 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) freeze_begin(); +#ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); printk("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); +#endif pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); + pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) goto Unlock; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2f30ca91e4fa..b2066fb5b10f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb) hb->error = 0; } -static void hib_end_io(struct bio *bio, int error) +static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate || error) { + if (bio->bi_error) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_iter.bi_sector); - - if (!error) - error = -EIO; } if (bio_data_dir(bio) == WRITE) put_page(page); - if (error && !hb->error) - hb->error = error; + if (bio->bi_error && !hb->error) + hb->error = bio->bi_error; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 019069c84ff6..1896386e16bb 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -17,6 +17,7 @@ #include <linux/list.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include "power.h" @@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {} #define WL_GC_COUNT_MAX 100 #define WL_GC_TIME_SEC 300 +static void __wakelocks_gc(struct work_struct *work); static LIST_HEAD(wakelocks_lru_list); +static DECLARE_WORK(wakelock_work, __wakelocks_gc); static unsigned int wakelocks_gc_count; static inline void wakelocks_lru_add(struct wakelock *wl) @@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl) list_move(&wl->lru, &wakelocks_lru_list); } -static void wakelocks_gc(void) +static void __wakelocks_gc(struct work_struct *work) { struct wakelock *wl, *aux; ktime_t now; - if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) - return; + mutex_lock(&wakelocks_lock); now = ktime_get(); list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { @@ -127,6 +129,16 @@ static void wakelocks_gc(void) } } wakelocks_gc_count = 0; + + mutex_unlock(&wakelocks_lock); +} + +static void wakelocks_gc(void) +{ + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + schedule_work(&wakelock_work); } #else /* !CONFIG_PM_WAKELOCKS_GC */ static inline void wakelocks_lru_add(struct wakelock *wl) {} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index cf8c24203368..b16f35487b67 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -517,6 +517,7 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } +EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { @@ -835,7 +836,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * This appends the listed symbols to /proc/vmcore * diff --git a/kernel/profile.c b/kernel/profile.c index a7bcd28d6e9f..99513e1160e5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info, node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - 
page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -543,14 +543,14 @@ static int create_hash_tables(void) int node = cpu_to_mem(cpu); struct page *page; - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_exact_node(node, + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36a..b760bae64cf1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(¤t->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); @@ -1003,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request, break; } #endif + + case PTRACE_SECCOMP_GET_FILTER: + ret = seccomp_get_filter(child, addr, datavp); + break; + default: break; } diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 50a808424b06..61a16569ffbf 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,4 @@ -obj-y += update.o +obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e32684c23b..d89328e260df 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -252,7 +252,7 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_state)(void); void (*cond_sync)(unsigned long oldstate); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); @@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void) } static void -call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +call_rcu_busted(struct rcu_head *head, rcu_callback_t func) { /* This is a deliberate bug for testing purposes only! 
*/ func(head); @@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void) } static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { call_srcu(srcu_ctlp, head, func); } @@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, + .get_state = get_state_synchronize_sched, + .cond_sync = cond_synchronize_sched, .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, @@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = { #define RCUTORTURE_TASKS_OPS &tasks_ops, +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + #else /* #ifdef CONFIG_TASKS_RCU */ #define RCUTORTURE_TASKS_OPS +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + #endif /* #else #ifdef CONFIG_TASKS_RCU */ /* @@ -756,7 +768,6 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -823,9 +834,7 @@ rcu_torture_cbflood(void *arg) } if (err) { VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - while (!torture_must_stop()) - schedule_timeout_interruptible(HZ); - return 0; + goto wait_for_stop; } VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); do { @@ -844,6 +853,7 @@ rcu_torture_cbflood(void *arg) stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); vfree(rhp); +wait_for_stop: torture_kthread_stopping("rcu_torture_cbflood"); return 0; } @@ -1088,7 +1098,8 @@ static void rcu_torture_timer(unsigned long unused) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1162,7 +1173,8 @@ rcu_torture_reader(void *arg) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1195,7 +1207,6 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1507,7 +1518,7 @@ static int rcu_torture_barrier_init(void) int i; int ret; - if (n_barrier_cbs == 0) + if (n_barrier_cbs <= 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { pr_alert("%s" TORTURE_FLAG @@ -1729,15 +1740,15 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ + cur_ops->init(); if (nreaders >= 0) { nrealreaders = nreaders; @@ -1786,12 +1797,15 @@ rcu_torture_init(void) writer_task); if (firsterr) goto unwind; - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nfakewriters > 0) { + fakewriter_tasks = kzalloc(nfakewriters * + sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } for (i = 0; i < nfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, @@ -1818,7 +1832,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - if (test_no_idle_hz) { + if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); if (firsterr) goto unwind; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index fb33d35ee0b7..a63a1ea5a41b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) } /** - * srcu_readers_active - returns approximate number of readers. + * srcu_readers_active - returns true if there are readers. and false + * otherwise * @sp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static int srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *sp) { int cpu; unsigned long sum = 0; @@ -297,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - preempt_disable(); __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ __this_cpu_inc(sp->per_cpu_ref->seq[idx]); - preempt_enable(); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -386,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { unsigned long flags; @@ -414,11 +413,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) struct rcu_head *head = &rcu.head; bool done = false; - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); might_sleep(); init_completion(&rcu.completion); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c new file mode 100644 index 000000000000..be922c9f3d37 --- /dev/null +++ b/kernel/rcu/sync.c @@ -0,0 +1,223 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov <oleg@redhat.com> + */ + +#include <linux/rcu_sync.h> +#include <linux/sched.h> + +#ifdef CONFIG_PROVE_RCU +#define __INIT_HELD(func) .held = func, +#else +#define __INIT_HELD(func) +#endif + +static const struct { + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + void (*wait)(void); +#ifdef CONFIG_PROVE_RCU + int (*held)(void); +#endif +} gp_ops[] = { + [RCU_SYNC] = { + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, + __INIT_HELD(rcu_read_lock_held) + }, + [RCU_SCHED_SYNC] = { + .sync = synchronize_sched, + .call = call_rcu_sched, + .wait = rcu_barrier_sched, + __INIT_HELD(rcu_read_lock_sched_held) + }, + [RCU_BH_SYNC] = { + .sync = synchronize_rcu_bh, + .call = call_rcu_bh, + .wait = rcu_barrier_bh, + __INIT_HELD(rcu_read_lock_bh_held) + }, +}; + +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; + +#define rss_lock gp_wait.lock + +#ifdef CONFIG_PROVE_RCU +void rcu_sync_lockdep_assert(struct rcu_sync *rsp) +{ + RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), + "suspicious rcu_sync_is_idle() usage"); +} +#endif + +/** + * rcu_sync_init() - Initialize an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be initialized + * @type: Flavor of RCU with which to synchronize rcu_sync structure + */ +void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +{ + memset(rsp, 0, sizeof(*rsp)); + init_waitqueue_head(&rsp->gp_wait); + rsp->gp_type = type; +} + +/** + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + bool need_wait, need_sync; + + spin_lock_irq(&rsp->rss_lock); + need_wait = rsp->gp_count++; + need_sync = rsp->gp_state == GP_IDLE; + if (need_sync) + rsp->gp_state = GP_PENDING; + spin_unlock_irq(&rsp->rss_lock); + + BUG_ON(need_wait && need_sync); + + if (need_sync) { + gp_ops[rsp->gp_type].sync(); + rsp->gp_state = GP_PASSED; + wake_up_all(&rsp->gp_wait); + } else if (need_wait) { + wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); + } else { + /* + * Possible when there's a pending CB from a rcu_sync_exit(). + * Nobody has yet been allowed the 'fast' path and thus we can + * avoid doing any sync(). The callback will get 'dropped'. 
+ */ + BUG_ON(rsp->gp_state != GP_PASSED); + } +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is passed to one of the call_rcu() functions by + * rcu_sync_exit(), so that it is invoked after a grace period following the + * that invocation of rcu_sync_exit(). It takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rcu) +{ + struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + unsigned long flags; + + BUG_ON(rsp->gp_state != GP_PASSED); + BUG_ON(rsp->cb_state == CB_IDLE); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * A new rcu_sync_begin() has happened; drop the callback. + */ + rsp->cb_state = CB_IDLE; + } else if (rsp->cb_state == CB_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback + * to catch a later GP. + */ + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else { + /* + * We're at least a GP after rcu_sync_exit(); eveybody will now + * have observed the write side critical section. Let 'em rip!. + */ + rsp->cb_state = CB_IDLE; + rsp->gp_state = GP_IDLE; + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who have completed, and can therefore + * now allow readers to make use of their fastpaths after a grace period + * has elapsed. After this grace period has completed, all subsequent + * calls to rcu_sync_is_idle() will return true, which tells readers that + * they can once again use their fastpaths. 
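To make the enter/exit pairing concrete, here is the kind of reader/writer pattern this infrastructure is intended for (editor's sketch; gate, fast_ctr and slow_ctr are hypothetical names, and the rcu_sync structure is assumed to have been set up with rcu_sync_init()):

        static struct rcu_sync gate;    /* rcu_sync_init(&gate, RCU_SCHED_SYNC) at init time */
        static DEFINE_PER_CPU(int, fast_ctr);
        static atomic_t slow_ctr = ATOMIC_INIT(0);

        static void reader_enter(void)
        {
                rcu_read_lock_sched();
                if (likely(rcu_sync_is_idle(&gate)))
                        __this_cpu_inc(fast_ctr);       /* no shared cacheline */
                else
                        atomic_inc(&slow_ctr);          /* writer active: slow path */
                rcu_read_unlock_sched();
        }

        static void writer_phase(void)
        {
                rcu_sync_enter(&gate);  /* may wait for a grace period; readers go slow */
                /* ... exclusive work that can now account for both counters ... */
                rcu_sync_exit(&gate);   /* readers regain the fast path after a GP */
        }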
+ */ +void rcu_sync_exit(struct rcu_sync *rsp) +{ + spin_lock_irq(&rsp->rss_lock); + if (!--rsp->gp_count) { + if (rsp->cb_state == CB_IDLE) { + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else if (rsp->cb_state == CB_PENDING) { + rsp->cb_state = CB_REPLAY; + } + } + spin_unlock_irq(&rsp->rss_lock); +} + +/** + * rcu_sync_dtor() - Clean up an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be cleaned up + */ +void rcu_sync_dtor(struct rcu_sync *rsp) +{ + int cb_state; + + BUG_ON(rsp->gp_count); + + spin_lock_irq(&rsp->rss_lock); + if (rsp->cb_state == CB_REPLAY) + rsp->cb_state = CB_PENDING; + cb_state = rsp->cb_state; + spin_unlock_irq(&rsp->rss_lock); + + if (cb_state != CB_IDLE) { + gp_ops[rsp->gp_type].wait(); + BUG_ON(rsp->cb_state != CB_IDLE); + } +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c291bd65d2cb..944b1b491ed8 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -44,7 +44,7 @@ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp); #include "tiny_plugin.h" @@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * Helper function for call_rcu() and call_rcu_bh(). */ static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp) { unsigned long flags; @@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_ctrlblk); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. 
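The prototype churn here and below only replaces the open-coded function-pointer type with the rcu_callback_t typedef; callers are unchanged. For reference, the usual shape of such a callback (editor's sketch; struct foo is hypothetical):

        struct foo {
                int data;
                struct rcu_head rcu;
        };

        static void foo_free_rcu(struct rcu_head *head) /* an rcu_callback_t */
        {
                struct foo *f = container_of(head, struct foo, rcu);

                kfree(f);
        }

        static void foo_destroy(struct foo *f)
        {
                /* The free is deferred until all pre-existing readers are done. */
                call_rcu(&f->rcu, foo_free_rcu);
        }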
*/ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_ctrlblk); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc28b2b..f07343b54fe5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -96,7 +97,7 @@ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .rda = &sname##_data, \ .call = cr, \ - .fqs_state = RCU_GP_IDLE, \ + .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ @@ -124,13 +125,8 @@ module_param(rcu_fanout_exact, bool, 0444); static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ - NUM_RCU_LVL_0, - NUM_RCU_LVL_1, - NUM_RCU_LVL_2, - NUM_RCU_LVL_3, - NUM_RCU_LVL_4, -}; +/* Number of rcu_nodes at specified level. */ +static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* @@ -164,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -248,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + unsigned long flags; + + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), + true); + } + local_irq_restore(flags); } } void rcu_bh_qs(void) { - if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } } @@ -340,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void) */ void rcu_note_context_switch(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. 
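The conversions above replace the open-coded "void (*func)(struct rcu_head *rcu)" parameter with the rcu_callback_t typedef. A hedged sketch of a typical callback matching that signature follows; my_obj, my_obj_free_rcu() and my_obj_release() are invented for illustration and assume kernel context.

struct my_obj {
	int payload;
	struct rcu_head rh;
};

static void my_obj_free_rcu(struct rcu_head *head)	/* has rcu_callback_t's signature */
{
	struct my_obj *obj = container_of(head, struct my_obj, rh);

	kfree(obj);
}

static void my_obj_release(struct my_obj *obj)
{
	/* Queue the object to be freed after a subsequent grace period. */
	call_rcu(&obj->rh, my_obj_free_rcu);
}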
*/ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -356,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * */ void rcu_all_qs(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_qs_ctr); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -649,12 +668,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user) * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /* @@ -701,7 +720,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -714,7 +733,7 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -828,7 +847,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -839,7 +858,7 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -978,9 +997,9 @@ bool notrace rcu_is_watching(void) { bool ret; - preempt_disable(); + preempt_disable_notrace(); ret = __rcu_is_watching(); - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_is_watching); @@ -1178,9 +1197,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, rsp->gp_flags); + rsp->gpnum, rsp->completed, + rsp->gp_flags, rsp->gp_state, + rsp->gp_kthread ? 
rsp->gp_kthread->state : 0); } /* @@ -1745,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->passed_quiesce = 0; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } @@ -1906,18 +1927,37 @@ static int rcu_gp_init(struct rcu_state *rsp) } /* + * Helper function for wait_event_interruptible_timeout() wakeup + * at force-quiescent-state time. + */ +static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Someone like call_rcu() requested a force-quiescent-state scan. */ + *gfp = READ_ONCE(rsp->gp_flags); + if (*gfp & RCU_GP_FLAG_FQS) + return true; + + /* The current grace period has completed. */ + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) + return true; + + return false; +} + +/* * Do one round of quiescent-state forcing. */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - int fqs_state = fqs_state_in; bool isidle = false; unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { + if (first_time) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { isidle = true; @@ -1926,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); rcu_sysidle_report_gp(rsp, isidle, maxj); - fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ isidle = true; @@ -1940,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); } - return fqs_state; } /* @@ -2004,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); - rsp->fqs_state = RCU_GP_IDLE; + rsp->gp_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; @@ -2022,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { - int fqs_state; + bool first_gp_fqs; int gf; unsigned long j; int ret; @@ -2041,6 +2079,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; @@ -2053,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle quiescent-state forcing. 
*/ - fqs_state = RCU_SAVE_DYNTICK; + first_gp_fqs = true; j = jiffies_till_first_fqs; if (j > HZ) { j = HZ; @@ -2068,11 +2107,8 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = READ_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || - (!READ_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); + rcu_gp_fqs_check_wake(rsp, &gf), j); + rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2084,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsstart")); - fqs_state = rcu_gp_fqs(rsp, fqs_state); + rcu_gp_fqs(rsp, first_gp_fqs); + first_gp_fqs = false; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); @@ -2110,7 +2147,9 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle grace-period end. */ + rsp->gp_state = RCU_GP_CLEANUP; rcu_gp_cleanup(rsp); + rsp->gp_state = RCU_GP_CLEANED; } } @@ -2318,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->passed_quiesce == 0 && + if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2329,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2338,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->qs_pending = 0; + rdp->core_needs_qs = 0; /* * This GP can't end until cpu checks in, so all of our @@ -2369,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Does this CPU still need to do its part for current grace period? * If no, return and let the other CPUs do their part as well. */ - if (!rdp->qs_pending) + if (!rdp->core_needs_qs) return; /* * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce && + if (rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -2998,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), +__call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; @@ -3069,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* * Queue an RCU-sched callback for invocation after a grace period. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_state, -1, 0); } @@ -3078,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Queue an RCU callback for invocation after a quicker grace period. 
*/ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_state, -1, 0); } @@ -3092,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * function may only be called from __kfree_rcu(). */ void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) + rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 1); } @@ -3161,10 +3200,10 @@ static inline int rcu_blocking_is_gp(void) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3188,10 +3227,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3253,190 +3292,574 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); -static int synchronize_sched_expedited_cpu_stop(void *data) +/** + * get_state_synchronize_sched - Snapshot current RCU-sched state + * + * Returns a cookie that is used by a later call to cond_synchronize_sched() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_sched(void) { /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. */ - smp_mb(); /* See above comment block. */ - return 0; + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_sched() + * and cond_synchronize_sched(). + */ + return smp_load_acquire(&rcu_sched_state.gpnum); } +EXPORT_SYMBOL_GPL(get_state_synchronize_sched); /** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. 
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. + * @oldstate: return value from earlier call to get_state_synchronize_sched() * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. + * If a full RCU-sched grace period has elapsed since the earlier call to + * get_state_synchronize_sched(), just return. Otherwise, invoke + * synchronize_sched() to wait for a full grace period. * - * If we fail too many times in a row, we fall back to synchronize_sched(). + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. */ -void synchronize_sched_expedited(void) +void cond_synchronize_sched(unsigned long oldstate) { - cpumask_var_t cm; - bool cma = false; - int cpu; - long firstsnap, s, snap; - int trycount = 0; - struct rcu_state *rsp = &rcu_sched_state; + unsigned long newstate; /* - * If we are in danger of counter wrap, just do synchronize_sched(). - * By allowing sync_sched_expedited_started to advance no more than - * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring - * that more than 3.5 billion CPUs would be required to force a - * counter wrap on a 32-bit system. Quite a few more CPUs would of - * course be required on a 64-bit system. + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. */ - if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), - (ulong)atomic_long_read(&rsp->expedited_done) + - ULONG_MAX / 8)) { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_wrap); + newstate = smp_load_acquire(&rcu_sched_state.completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_sched); + +/* Adjust sequence number for start of update-side operation. */ +static void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(!(*sp & 0x1)); +} + +/* Adjust sequence number for end of update-side operation. */ +static void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WRITE_ONCE(*sp, *sp + 1); + WARN_ON_ONCE(*sp & 0x1); +} + +/* Take a snapshot of the update side's sequence number. 
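A hedged sketch of the cookie pattern that get_state_synchronize_sched() and cond_synchronize_sched() document above; my_update() is an invented caller and kernel context is assumed.

static void my_update(void)
{
	unsigned long cookie;

	cookie = get_state_synchronize_sched();	/* snapshot the GP counter */

	/* ... long-running work that does not itself need a grace period ... */

	/*
	 * Returns immediately if a full RCU-sched grace period elapsed
	 * while the work above ran; otherwise falls back to
	 * synchronize_sched().
	 */
	cond_synchronize_sched(cookie);

	/* Now safe to reclaim what earlier readers might still have seen. */
}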
*/ +static unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(*sp) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + return rcu_seq_snap(&rsp->expedited_sequence); +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) return; - } + rsp->ncpus_snap = ncpus; /* - * Take a ticket. Note that atomic_inc_return() implies a - * full memory barrier. + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. */ - snap = atomic_long_inc_return(&rsp->expedited_start); - firstsnap = snap; - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave(&rnp_up->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } } - WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. 
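The rcu_seq_start()/rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() helpers above implement an even/odd sequence counter: an odd value means an update-side operation is in flight, and a snapshot of (s + 3) & ~0x1 is only "done" once any in-flight operation plus one full subsequent operation have completed. A standalone userspace model of just that arithmetic (no memory barriers or wrap handling, which the real helpers provide):

#include <stdio.h>

static unsigned long seq;	/* models, e.g., rsp->expedited_sequence */

static unsigned long seq_snap(void) { return (seq + 3) & ~0x1UL; }
static int seq_done(unsigned long s) { return seq >= s; }

int main(void)
{
	unsigned long s;

	seq = 4;			/* even: no operation in progress */
	s = seq_snap();			/* 6: one full operation suffices */
	seq++;				/* start (counter goes odd)  */
	seq++;				/* end   (counter goes even) */
	printf("idle at snap: done=%d\n", seq_done(s));		/* done=1 */

	seq = 5;			/* odd: an operation was already in flight */
	s = seq_snap();			/* 8: current op must end, then a full new op */
	seq++;				/* in-flight op ends -> 6, still not enough */
	printf("only old op ended: done=%d\n", seq_done(s));	/* done=0 */
	seq += 2;			/* one complete new start/end pair -> 8 */
	printf("after new full op: done=%d\n", seq_done(s));	/* done=1 */
	return 0;
}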
*/ - cma = zalloc_cpumask_var(&cm, GFP_KERNEL); - if (cma) { - cpumask_copy(cm, cpu_online_mask); - cpumask_clear_cpu(raw_smp_processor_id(), cm); - for_each_cpu(cpu, cm) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the root rcu_node's exp_funnel_mutex and the + * specified rcu_node structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - cpumask_clear_cpu(cpu, cm); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; } - if (cpumask_weight(cm) == 0) - goto all_cpus_idle; + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + wake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the root + * rcu_node's exp_funnel_mutex. 
+ */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp, + atomic_long_t *stat, unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + if (rnp) + mutex_unlock(&rnp->exp_funnel_mutex); + else if (rdp) + mutex_unlock(&rdp->exp_funnel_mutex); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns a + * pointer to the root rcu_node structure, or NULL if some other + * task did the expedited grace period for us. + */ +static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp; + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. + * First try directly acquiring the root lock in order to reduce + * latency in the common case where expedited grace periods are + * rare. We check mutex_is_locked() to avoid pathological levels of + * memory contention on ->exp_funnel_mutex in the heavy-load case. */ - while (try_stop_cpus(cma ? cm : cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - atomic_long_inc(&rsp->expedited_tryfail); - - /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone1); - free_cpumask_var(cm); - return; + rnp0 = rcu_get_root(rsp); + if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { + if (mutex_trylock(&rnp0->exp_funnel_mutex)) { + if (sync_exp_work_done(rsp, rnp0, NULL, + &rsp->expedited_workdone0, s)) + return NULL; + return rnp0; } + } - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); - return; - } + /* + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. 
+ */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rdp->exp_funnel_mutex); + rnp0 = rdp->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone2, s)) + return NULL; + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + else + mutex_unlock(&rdp->exp_funnel_mutex); + rnp1 = rnp0; + } + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone3, s)) + return NULL; + return rnp1; +} - /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone2); - free_cpumask_var(cm); - return; +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We retry - * after they started, so our grace period works for them, - * and they started after our first try, so their grace - * period works for us. + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. */ - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, use normal GP. 
*/ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); - return; + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + } else { + /* Failed, raced with offline. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, + flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave(&rnp->lock, + flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } } - snap = atomic_long_read(&rsp->expedited_start); - smp_mb(); /* ensure read is before try_stop_cpus(). */ + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); } - atomic_long_inc(&rsp->expedited_stoppedcpus); +} -all_cpus_idle: - free_cpumask_var(cm); +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did already did their update. - */ - do { - atomic_long_inc(&rsp->expedited_done_tries); - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_done_lost); - break; + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = wait_event_interruptible_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. 
*/ + wait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + rcu_for_each_leaf_node(rsp, rnp) { + (void)rcu_print_task_exp_stall(rnp); + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + + if (!(rnp->expmask & mask)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + mask <<= 1; } - } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); - atomic_long_inc(&rsp->expedited_done_exit); + pr_cont(" } %lu jiffies s: %lu\n", + jiffies - jiffies_start, rsp->expedited_sequence); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} - put_online_cpus(); +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + + rnp = exp_funnel_lock(rsp, s); + if (rnp == NULL) + return; /* Someone else did our work for us. */ + + rcu_exp_gp_seq_start(rsp); + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + synchronize_sched_expedited_wait(rsp); + + rcu_exp_gp_seq_end(rsp); + mutex_unlock(&rnp->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -3462,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? 
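The expedited-stall printout above uses the "O."[flag] idiom: a two-character string literal indexed by a 0/1 condition, so the warning letter appears when the condition is false and a quiet '.' when it is true. A tiny standalone illustration:

#include <stdio.h>

int main(void)
{
	int online = 0;				/* pretend CPU 3 is offline */

	printf("%d-%c\n", 3, "O."[online]);	/* index 0 -> 'O': prints "3-O" */
	online = 1;
	printf("%d-%c\n", 3, "O."[online]);	/* index 1 -> '.': prints "3-." */
	return 0;
}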
*/ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce && + rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { - rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && - (rdp->passed_quiesce || + rdp->n_rp_core_needs_qs++; + } else if (rdp->core_needs_qs && + (!rdp->cpu_no_qs.b.norm || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -3571,10 +3994,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); } } @@ -3586,7 +4009,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -3599,55 +4022,24 @@ static void _rcu_barrier(struct rcu_state *rsp) { int cpu; struct rcu_data *rdp; - unsigned long snap = READ_ONCE(rsp->n_barrier_done); - unsigned long snap_done; + unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, snap); + _rcu_barrier_trace(rsp, "Begin", -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - /* - * Ensure that all prior references, including to ->n_barrier_done, - * are ordered before the _rcu_barrier() machinery. - */ - smp_mb(); /* See above block comment. */ - - /* - * Recheck ->n_barrier_done to see if others did our work for us. - * This means checking ->n_barrier_done for an even-to-odd-to-even - * transition. The "if" expression below therefore rounds the old - * value up to the next even number and adds two before comparing. - */ - snap_done = rsp->n_barrier_done; - _rcu_barrier_trace(rsp, "Check", -1, snap_done); - - /* - * If the value in snap is odd, we needed to wait for the current - * rcu_barrier() to complete, then wait for the next one, in other - * words, we need the value of snap_done to be three larger than - * the value of snap. On the other hand, if the value in snap is - * even, we only had to wait for the next rcu_barrier() to complete, - * in other words, we need the value of snap_done to be only two - * greater than the value of snap. The "(snap + 3) & ~0x1" computes - * this for us (thank you, Linus!). - */ - if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + /* Did someone else do our work for us? */ + if (rcu_seq_done(&rsp->barrier_sequence, s)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; } - /* - * Increment ->n_barrier_done to avoid duplicate work. Use - * WRITE_ONCE() to prevent the compiler from speculating - * the increment to precede the early-exit check. - */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); - smp_mb(); /* Order ->n_barrier_done increment with below mechanism. 
*/ + /* Mark the start of the barrier operation. */ + rcu_seq_start(&rsp->barrier_sequence); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3671,10 +4063,10 @@ static void _rcu_barrier(struct rcu_state *rsp) if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, @@ -3682,11 +4074,11 @@ static void _rcu_barrier(struct rcu_state *rsp) } } else if (READ_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } } put_online_cpus(); @@ -3698,16 +4090,13 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); - /* Increment ->n_barrier_done to prevent duplicate work. */ - smp_mb(); /* Keep increment after above mechanism. */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); - smp_mb(); /* Keep increment before caller's subsequent code. */ - /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); + /* Mark the end of the barrier operation. */ + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + rcu_seq_end(&rsp->barrier_sequence); + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); } @@ -3770,6 +4159,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; + mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -3790,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3812,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + if (!rdp->beenonline) + WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); + rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ rdp->completed = rnp->completed; - rdp->passed_quiesce = false; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); - rdp->qs_pending = false; + rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -3849,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: + sync_sched_exp_online_cleanup(cpu); rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: @@ -3860,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self, rcu_cleanup_dying_cpu(rsp); break; case CPU_DYING_IDLE: + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp); } @@ -3961,22 +4361,22 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on the rcu_fanout_exact boot parameter. */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) +static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; if (rcu_fanout_exact) { - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = RCU_FANOUT; + levelspread[i] = RCU_FANOUT; } else { int ccur; int cprv; cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; } } @@ -3988,23 +4388,19 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static const char * const buf[] = { - "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static const char * const fqs[] = { - "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = RCU_NODE_NAME_INIT; + static const char * const fqs[] = RCU_FQS_NAME_INIT; + static const char * const exp[] = RCU_EXP_NAME_INIT; static u8 fl_mask = 0x1; + + int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ + int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; int j; struct rcu_node *rnp; - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ /* Silence gcc 4.8 false positive about array index out of range. */ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) @@ -4013,19 +4409,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) - rsp->levelcnt[i] = num_rcu_lvl[i]; + levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); + rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; + rcu_init_levelspread(levelspread, levelcnt); rsp->flavor_mask = fl_mask; fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. 
*/ for (i = rcu_num_lvls - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; + cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + for (j = 0; j < levelcnt[i]; j++, rnp++) { raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); @@ -4045,18 +4441,22 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->grpmask = 0; rnp->parent = NULL; } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpnum = j % levelspread[i - 1]; rnp->grpmask = 1UL << rnp->grpnum; rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; + j / levelspread[i - 1]; } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); + mutex_init(&rnp->exp_funnel_mutex); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } init_waitqueue_head(&rsp->gp_wq); + init_waitqueue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -4076,9 +4476,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int j; - int n = nr_cpu_ids; - int rcu_capacity[MAX_RCU_LVLS + 1]; + int rcu_capacity[RCU_NUM_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4101,47 +4499,51 @@ static void __init rcu_init_geometry(void) rcu_fanout_leaf, nr_cpu_ids); /* + * The boot-time rcu_fanout_leaf parameter must be at least two + * and cannot exceed the number of bits in the rcu_node masks. + * Complain and fall back to the compile-time values if this + * limit is exceeded. + */ + if (rcu_fanout_leaf < 2 || + rcu_fanout_leaf > sizeof(unsigned long) * 8) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; + WARN_ON(1); + return; + } + + /* * Compute number of nodes that can be handled an rcu_node tree - * with the given number of levels. Setting rcu_capacity[0] makes - * some of the arithmetic easier. + * with the given number of levels. */ - rcu_capacity[0] = 1; - rcu_capacity[1] = rcu_fanout_leaf; - for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[0] = rcu_fanout_leaf; + for (i = 1; i < RCU_NUM_LVLS; i++) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Finally, the tree must be able to accommodate - * the configured number of CPUs. Complain and fall back to the - * compile-time values if these limits are exceeded. + * The tree must be able to accommodate the configured number of CPUs. + * If this limit is exceeded, fall back to the compile-time values. */ - if (rcu_fanout_leaf < RCU_FANOUT_LEAF || - rcu_fanout_leaf > sizeof(unsigned long) * 8 || - n > rcu_capacity[MAX_RCU_LVLS]) { + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; } + /* Calculate the number of levels in the tree. */ + for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { + } + rcu_num_lvls = i + 1; + /* Calculate the number of rcu_nodes at each level of the tree. 
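The new geometry code derives the smallest number of tree levels that can cover nr_cpu_ids, then (in the loop just below) the number of rcu_node structures at each level. A standalone userspace recomputation of that arithmetic; the CPU count and fanouts are example values, not taken from the patch.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define MY_NUM_LVLS		4	/* mirrors RCU_NUM_LVLS for a 4-level build */

int main(void)
{
	int nr_cpu_ids = 6000, fanout_leaf = 16, fanout = 64;	/* example values */
	int rcu_capacity[MY_NUM_LVLS], num_rcu_lvl[MY_NUM_LVLS];
	int i, num_lvls, num_nodes = 0;

	/* Capacity of a tree with i+1 levels, as in rcu_init_geometry(). */
	rcu_capacity[0] = fanout_leaf;
	for (i = 1; i < MY_NUM_LVLS; i++)
		rcu_capacity[i] = rcu_capacity[i - 1] * fanout;

	/* Smallest number of levels that can cover nr_cpu_ids CPUs. */
	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++)
		;
	num_lvls = i + 1;

	/* Nodes needed at each level, root (level 0) downward. */
	for (i = 0; i < num_lvls; i++) {
		int cap = rcu_capacity[(num_lvls - 1) - i];

		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
		num_nodes += num_rcu_lvl[i];
		printf("level %d: %d rcu_node structures\n", i, num_rcu_lvl[i]);
	}
	printf("total: %d rcu_node structures for %d CPUs\n", num_nodes, nr_cpu_ids);
	return 0;
}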
*/ - for (i = 1; i <= MAX_RCU_LVLS; i++) - if (n <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) - num_rcu_lvl[j] = - DIV_ROUND_UP(n, rcu_capacity[i - j]); - rcu_num_lvls = i; - for (j = i + 1; j <= MAX_RCU_LVLS; j++) - num_rcu_lvl[j] = 0; - break; - } + for (i = 0; i < rcu_num_lvls; i++) { + int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; + num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); + } /* Calculate the total number of rcu_node structures. */ rcu_num_nodes = 0; - for (i = 0; i <= MAX_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= n; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4adb7ca0bf47..9fb4e238d4dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/stop_machine.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -36,8 +37,6 @@ * Of course, your mileage may vary. */ -#define MAX_RCU_LVLS 4 - #ifdef CONFIG_RCU_FANOUT #define RCU_FANOUT CONFIG_RCU_FANOUT #else /* #ifdef CONFIG_RCU_FANOUT */ @@ -66,38 +65,45 @@ #if NR_CPUS <= RCU_FANOUT_1 # define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_3 (NR_CPUS) -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT 
insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - extern int rcu_num_lvls; extern int rcu_num_nodes; @@ -157,16 +163,21 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blkd_tasks */ - /* elements that need to drain to allow the */ - /* current expedited grace period to */ - /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initial value for qsmask & expmask. */ + /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; /* Online CPUs for next grace period. */ + unsigned long expmask; /* CPUs or groups that need to check in */ + /* to allow the current expedited GP */ + /* to complete. */ + unsigned long expmaskinit; + /* Per-GP initial values for expmask. */ + /* Initialized from ->expmaskinitnext at the */ + /* beginning of each expedited GP. */ + unsigned long expmaskinitnext; + /* Online CPUs for next expedited GP. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -236,6 +247,8 @@ struct rcu_node { int need_future_gp[2]; /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; + + struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -265,6 +278,18 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Union to allow "aggregate OR" operation on the need for a quiescent + * state by the normal and expedited grace periods. + */ +union rcu_noqs { + struct { + u8 norm; + u8 exp; + } b; /* Bits. */ + u16 s; /* Set of bits, aggregate OR here. */ +}; + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -281,18 +306,16 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool passed_quiesce; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ + union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ + bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -346,7 +369,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. 
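The union rcu_noqs added above lets the per-CPU code set the normal and expedited "quiescent state still needed" flags independently, while the core can test both at once through the overlapping 16-bit member. A standalone userspace demonstration of that aggregate-OR property:

#include <stdio.h>
#include <stdint.h>

union noqs {
	struct {
		uint8_t norm;	/* normal GP still needs a QS from this CPU */
		uint8_t exp;	/* expedited GP still needs a QS from this CPU */
	} b;
	uint16_t s;		/* nonzero iff either flag is set */
};

int main(void)
{
	union noqs q = { .s = 0 };

	q.b.norm = 1;
	printf("norm only: any=%d\n", q.s != 0);	/* 1 */
	q.b.norm = 0;
	q.b.exp = 1;
	printf("exp only : any=%d\n", q.s != 0);	/* 1 */
	q.b.exp = 0;
	printf("neither  : any=%d\n", q.s != 0);	/* 0 */
	return 0;
}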
*/ - unsigned long n_rp_qs_pending; + unsigned long n_rp_core_needs_qs; unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; @@ -355,11 +378,12 @@ struct rcu_data { unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() and OOM callbacks. */ + /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + struct mutex exp_funnel_mutex; /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -387,21 +411,12 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned int softirq_snap; /* Snapshot of softirq activity. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ int cpu; struct rcu_state *rsp; }; -/* Values for fqs_state field in struct rcu_state. */ -#define RCU_GP_IDLE 0 /* No grace period in progress. */ -#define RCU_GP_INIT 1 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK - /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOGP_WAKE_NOT 0 #define RCU_NOGP_WAKE 1 @@ -442,19 +457,18 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + struct rcu_node *level[RCU_NUM_LVLS + 1]; + /* Hierarchy levels (+1 to */ + /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ - void (*func)(struct rcu_head *head)); + call_rcu_func_t call; /* call_rcu() flavor. */ + int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ - u8 fqs_state ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - u8 boost; /* Subject to priority boost. */ + u8 boost ____cacheline_internodealigned_in_smp; + /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ @@ -479,21 +493,19 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ - unsigned long n_barrier_done; /* ++ at start and end of */ + unsigned long barrier_sequence; /* ++ at start and end of */ /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ - atomic_long_t expedited_start; /* Starting ticket. */ - atomic_long_t expedited_done; /* Done ticket. */ - atomic_long_t expedited_wrap; /* # near-wrap incidents. */ - atomic_long_t expedited_tryfail; /* # acquisition failures. */ + unsigned long expedited_sequence; /* Take a ticket. */ + atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ - atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. 
*/ - atomic_long_t expedited_done_tries; /* # tries to update _done. */ - atomic_long_t expedited_done_lost; /* # times beaten to _done. */ - atomic_long_t expedited_done_exit; /* # times exited _done loop. */ + atomic_t expedited_need_qs; /* # CPUs left to check in. */ + wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ @@ -524,10 +536,14 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ -/* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +/* Values for rcu_state structure's gp_state field. */ +#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ -#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ +#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ extern struct list_head rcu_struct_flavors; @@ -564,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -635,3 +652,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. 
*/ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485fb2b06..630c19772630 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) pr_info("\tRCU torture testing starts during boot.\n"); - if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); - if (NUM_RCU_LVL_4 != 0) - pr_info("\tFour-level hierarchy is enabled.\n"); + if (RCU_NUM_LVLS >= 4) + pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); @@ -103,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *const rcu_state_p = &rcu_preempt_state; static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; -static int rcu_preempted_readers_exp(struct rcu_node *rnp); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); @@ -116,6 +113,147 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } +/* Flags for rcu_preempt_ctxt_queue() decision table. */ +#define RCU_GP_TASKS 0x8 +#define RCU_EXP_TASKS 0x4 +#define RCU_GP_BLKD 0x2 +#define RCU_EXP_BLKD 0x1 + +/* + * Queues a task preempted within an RCU-preempt read-side critical + * section into the appropriate location within the ->blkd_tasks list, + * depending on the states of any ongoing normal and expedited grace + * periods. The ->gp_tasks pointer indicates which element the normal + * grace period is waiting on (NULL if none), and the ->exp_tasks pointer + * indicates which element the expedited grace period is waiting on (again, + * NULL if none). If a grace period is waiting on a given element in the + * ->blkd_tasks list, it also waits on all subsequent elements. Thus, + * adding a task to the tail of the list blocks any grace period that is + * already waiting on one of the elements. In contrast, adding a task + * to the head of the list won't block any grace period that is already + * waiting on one of the elements. + * + * This queuing is imprecise, and can sometimes make an ongoing grace + * period wait for a task that is not strictly speaking blocking it. + * Given the choice, we needlessly block a normal grace period rather than + * blocking an expedited grace period. + * + * Note that an endless sequence of expedited grace periods still cannot + * indefinitely postpone a normal grace period. Eventually, all of the + * fixed number of preempted tasks blocking the normal grace period that are + * not also blocking the expedited grace period will resume and complete + * their RCU read-side critical sections. At that point, the ->gp_tasks + * pointer will equal the ->exp_tasks pointer, at which point the end of + * the corresponding expedited grace period will also be the end of the + * normal grace period. + */ +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long flags) __releases(rnp->lock) +{ + int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + + (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + + (rnp->expmask & rdp->grpmask ? 
RCU_EXP_BLKD : 0); + struct task_struct *t = current; + + /* + * Decide where to queue the newly blocked task. In theory, + * this could be an if-statement. In practice, when I tried + * that, it was quite messy. + */ + switch (blkd_state) { + case 0: + case RCU_EXP_TASKS: + case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS: + case RCU_GP_TASKS + RCU_EXP_TASKS: + + /* + * Blocking neither GP, or first task blocking the normal + * GP but not blocking the already-waiting expedited GP. + * Queue at the head of the list to avoid unnecessarily + * blocking the already-waiting GPs. + */ + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_BLKD: + case RCU_GP_BLKD: + case RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + + /* + * First task arriving that blocks either GP, or first task + * arriving that blocks the expedited GP (with the normal + * GP already waiting), or a task arriving that blocks + * both GPs with both GPs already waiting. Queue at the + * tail of the list to avoid any GP waiting on any of the + * already queued tasks that are not blocking it. + */ + list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + + /* + * Second or subsequent task blocking the expedited GP. + * The task either does not block the normal GP, or is the + * first task blocking the normal GP. Queue just after + * the first task blocking the expedited GP. + */ + list_add(&t->rcu_node_entry, rnp->exp_tasks); + break; + + case RCU_GP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + + /* + * Second or subsequent task blocking the normal GP. + * The task does not block the expedited GP. Queue just + * after the first task blocking the normal GP. + */ + list_add(&t->rcu_node_entry, rnp->gp_tasks); + break; + + default: + + /* Yet another exercise in excessive paranoia. */ + WARN_ON_ONCE(1); + break; + } + + /* + * We have now queued the task. If it was the first one to + * block either grace period, update the ->gp_tasks and/or + * ->exp_tasks pointers, respectively, to reference the newly + * blocked tasks. + */ + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + rnp->gp_tasks = &t->rcu_node_entry; + if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) + rnp->exp_tasks = &t->rcu_node_entry; + raw_spin_unlock(&rnp->lock); + + /* + * Report the quiescent state for the expedited GP. This expedited + * GP should not be able to end until we report, so there should be + * no need to check for a subsequent expedited GP. (Though we are + * still in a quiescent state in any case.) + */ + if (blkd_state & RCU_EXP_BLKD && + t->rcu_read_unlock_special.b.exp_need_qs) { + t->rcu_read_unlock_special.b.exp_need_qs = false; + rcu_report_exp_rdp(rdp->rsp, rdp, true); + } else { + WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); + } + local_irq_restore(flags); +} + /* * Record a preemptible-RCU quiescent state for the specified CPU. 
Note * that this just means that the task currently running on the CPU is @@ -127,11 +265,11 @@ static void __init rcu_bootup_announce(void) */ static void rcu_preempt_qs(void) { - if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->passed_quiesce, 1); + __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -169,42 +307,18 @@ static void rcu_preempt_note_context_switch(void) t->rcu_blocked_node = rnp; /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. Note that there is some uncertainty as - * to exactly when the current grace period started. - * We take a conservative approach, which can result - * in unnecessarily waiting on tasks that started very - * slightly after the current grace period began. C'est - * la vie!!! - * - * But first, note that the current CPU must still be - * on line! + * Verify the CPU's sanity, trace the preemption, and + * then queue the task as required based on the states + * of any ongoing and expedited grace periods. */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { - list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); - rnp->gp_tasks = &t->rcu_node_entry; - if (IS_ENABLED(CONFIG_RCU_BOOST) && - rnp->boost_tasks != NULL) - rnp->boost_tasks = rnp->gp_tasks; - } else { - list_add(&t->rcu_node_entry, &rnp->blkd_tasks); - if (rnp->qsmask & rdp->grpmask) - rnp->gp_tasks = &t->rcu_node_entry; - } trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_preempt_ctxt_queue(rnp, rdp, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -274,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; + struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; @@ -284,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. Because irqs are disabled, + * If RCU core is waiting for this CPU to exit its critical section, + * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; @@ -298,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t) } } + /* + * Respond to a request for an expedited grace period, but only if + * we were not preempted, meaning that we were running on the same + * CPU throughout. If we were preempted, the exp_need_qs flag + * would have been cleared at the time of the first preemption, + * and the quiescent state would be reported when we were dequeued. 
+ */ + if (special.b.exp_need_qs) { + WARN_ON_ONCE(special.b.blocked); + t->rcu_read_unlock_special.b.exp_need_qs = false; + rdp = this_cpu_ptr(rcu_state_p->rda); + rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (!t->rcu_read_unlock_special.s) { + local_irq_restore(flags); + return; + } + } + /* Hardware IRQ handlers cannot block, complain if they get here. */ if (in_irq() || in_serving_softirq()) { lockdep_rcu_suspicious(__FILE__, __LINE__, "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", t->rcu_read_unlock_special.s, t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.exp_need_qs, t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; @@ -331,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - empty_exp = !rcu_preempted_readers_exp(rnp); + empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -355,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); + empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, @@ -418,8 +552,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#ifdef CONFIG_RCU_CPU_STALL_INFO - static void rcu_print_task_stall_begin(struct rcu_node *rnp) { pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -431,18 +563,6 @@ static void rcu_print_task_stall_end(void) pr_cont("\n"); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -} - -static void rcu_print_task_stall_end(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -466,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + +/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! 
This function must be @@ -499,8 +640,8 @@ static void rcu_preempt_check_callbacks(void) return; } if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_data_p->qs_pending) && - !__this_cpu_read(rcu_data_p->passed_quiesce)) + __this_cpu_read(rcu_data_p->core_needs_qs) && + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) t->rcu_read_unlock_special.b.need_qs = true; } @@ -516,7 +657,7 @@ static void rcu_preempt_do_callbacks(void) /* * Queue a preemptible-RCU callback for invocation after a grace period. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 0); } @@ -538,10 +679,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; if (rcu_gp_is_expedited()) @@ -551,157 +692,41 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return !rcu_preempted_readers_exp(rnp) && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ - unsigned long flags; - unsigned long mask; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). 
*/ - wake_up(&sync_rcu_preempt_exp_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - rnp->expmask &= ~mask; - } -} - -/* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 1. If there - * are such tasks, set the ->expmask bits up the rcu_node tree and also - * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 - * that work is needed here. - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static void -sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp_up; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp->expmask); - WARN_ON_ONCE(rnp->exp_tasks); - if (!rcu_preempt_has_tasks(rnp)) { - /* No blocked tasks, nothing to do. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - /* Call for Phase 2 and propagate ->expmask bits up the tree. */ - rnp->expmask = 1; - rnp_up = rnp; - while (rnp_up->parent) { - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - if (rnp_up->expmask & mask) - break; - raw_spin_lock(&rnp_up->lock); /* irqs already off */ - smp_mb__after_unlock_lock(); - rnp_up->expmask |= mask; - raw_spin_unlock(&rnp_up->lock); /* irqs still off */ - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 2. If the - * leaf rcu_node structure has its ->expmask field set, check for tasks. - * If there are some, clear ->expmask and set ->exp_tasks accordingly, - * then initiate RCU priority boosting. Otherwise, clear ->expmask and - * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, - * enabling rcu_read_unlock_special() to do the bit-clearing. - * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. */ -static void -sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) +static void sync_rcu_exp_handler(void *info) { - unsigned long flags; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - if (!rnp->expmask) { - /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Phase 1 is over. */ - rnp->expmask = 0; + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; /* - * If there are still blocked tasks, set up ->exp_tasks so that - * rcu_read_unlock_special() will wake us and then boost them. + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. 
*/ - if (rcu_preempt_has_tasks(rnp)) { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; return; } - /* No longer any blocked tasks, so undo bit setting. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); } /** @@ -719,80 +744,28 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) void synchronize_rcu_expedited(void) { struct rcu_node *rnp; + struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; - unsigned long snap; - int trycount = 0; + unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; - smp_mb(); /* Above access cannot bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); - /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - if (!try_get_online_cpus()) { - /* CPU-hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu); - return; - } + rnp_unlock = exp_funnel_lock(rsp, s); + if (rnp_unlock == NULL) + return; /* Someone else did our work for us. */ - /* - * Acquire lock, falling back to synchronize_rcu() if too many - * lock-acquisition failures. Of course, if someone does the - * expedited grace period for us, just leave. - */ - while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto mb_ret; /* Others did our work for us. */ - } - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - put_online_cpus(); - wait_rcu_gp(call_rcu); - return; - } - } - if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto unlock_mb_ret; /* Others did our work for us. */ - } + rcu_exp_gp_seq_start(rsp); - /* force all RCU readers onto ->blkd_tasks lists. */ - synchronize_sched_expedited(); - - /* - * Snapshot current state of ->blkd_tasks lists into ->expmask. - * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() - * to start clearing them. Doing this in one phase leads to - * strange races between setting and clearing bits, so just say "no"! - */ - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init1(rsp, rnp); - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init2(rsp, rnp); - - put_online_cpus(); + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. 
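
Aside: synchronize_rcu_expedited() above now snapshots the expedited sequence counter ("take a ticket") and returns early from the funnel lock if someone else's expedited grace period already covers that snapshot. A compile-able toy of the idea, assuming the usual even/odd convention (odd while a grace period is in flight); this is only the shape of the check, not the kernel's exact helpers:

#include <stdbool.h>
#include <stdio.h>

static unsigned long seq;       /* even: idle, odd: a grace period in flight */

static void seq_start(void) { seq++; }  /* becomes odd */
static void seq_end(void)   { seq++; }  /* becomes even again */

/* Earliest even value that guarantees a full grace period after "now". */
static unsigned long seq_snap(void)
{
        return (seq + 3) & ~0x1UL;
}

static bool seq_done(unsigned long snap)
{
        return seq >= snap;     /* wraparound ignored in this sketch */
}

int main(void)
{
        unsigned long s = seq_snap();   /* counter idle, so s == seq + 2 */

        seq_start();
        printf("done mid-GP?   %d\n", seq_done(s));     /* 0 */
        seq_end();
        printf("done after GP? %d\n", seq_done(s));     /* 1 */
        return 0;
}
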
*/ rnp = rcu_get_root(rsp); - wait_event(sync_rcu_preempt_exp_wq, - sync_rcu_preempt_exp_done(rnp)); + synchronize_sched_expedited_wait(rsp); /* Clean up and exit. */ - smp_mb(); /* ensure expedited GP seen before counter increment. */ - WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); -mb_ret: - smp_mb(); /* ensure subsequent action seen after grace period. */ + rcu_exp_gp_seq_end(rsp); + mutex_unlock(&rnp_unlock->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -893,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + +/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. @@ -1061,8 +1044,7 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Priority-boosting kthread. One per leaf rcu_node and one for the - * root rcu_node. + * Priority-boosting kthread, one per leaf rcu_node. */ static int rcu_boost_kthread(void *arg) { @@ -1680,12 +1662,10 @@ static int rcu_oom_notify(struct notifier_block *self, */ atomic_set(&oom_callback_count, 1); - get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); cond_resched_rcu_qs(); } - put_online_cpus(); /* Unconditionally decrement: no need to wake ourselves up. */ atomic_dec(&oom_callback_count); @@ -1706,8 +1686,6 @@ early_initcall(rcu_register_oom_notifier); #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -1765,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", - cpu, ticks_value, ticks_title, + pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), @@ -1796,33 +1778,6 @@ static void increment_cpu_stall_ticks(void) raw_cpu_inc(rsp->rda->ticks_this_gp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void print_cpu_stall_info_begin(void) -{ - pr_cont(" {"); -} - -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - pr_cont(" %d", cpu); -} - -static void print_cpu_stall_info_end(void) -{ - pr_cont("} "); -} - -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -} - -static void increment_cpu_stall_ticks(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - #ifdef CONFIG_RCU_NOCB_CPU /* diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3ea7ffc7d5c4..ef7093cc9b5c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v) static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state 
*)m->private; - seq_printf(m, "bcc: %d nbd: %lu\n", + seq_printf(m, "bcc: %d bseq: %lu\n", atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); + rsp->barrier_sequence); return 0; } @@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, + rdp->cpu_no_qs.b.norm, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), - rdp->qs_pending); + rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", - atomic_long_read(&rsp->expedited_start), - atomic_long_read(&rsp->expedited_done), - atomic_long_read(&rsp->expedited_wrap), - atomic_long_read(&rsp->expedited_tryfail), + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, + atomic_long_read(&rsp->expedited_workdone0), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_workdone3), atomic_long_read(&rsp->expedited_normal), - atomic_long_read(&rsp->expedited_stoppedcpus), - atomic_long_read(&rsp->expedited_done_tries), - atomic_long_read(&rsp->expedited_done_lost), - atomic_long_read(&rsp->expedited_done_exit)); + atomic_read(&rsp->expedited_need_qs), + rsp->expedited_sequence / 2); return 0; } @@ -271,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", ulong2long(rsp->completed), ulong2long(gpnum), - rsp->fqs_state, + rsp->gp_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", @@ -364,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending); seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_qs_pending, + rdp->n_rp_core_needs_qs, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index afaecb7a799a..5f748c5a40f0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +/** + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * RCU-sched read-side critical section. In absence of + * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side + * critical section unless it can prove otherwise. Note that disabling + * of preemption (including disabling irqs) counts as an RCU-sched + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. 
+ * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run in + * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. + */ +int rcu_read_lock_sched_held(void) +{ + int lockdep_opinion = 0; + + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + if (debug_locks) + lockdep_opinion = lock_is_held(&rcu_sched_lock_map); + return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); +} +EXPORT_SYMBOL(rcu_read_lock_sched_held); +#endif + #ifndef CONFIG_TINY_RCU static atomic_t rcu_expedited_nesting = @@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } +EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void wait_rcu_gp(call_rcu_func_t crf) +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array) { - struct rcu_synchronize rcu; + int i; - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + /* Initialize and register callbacks for each flavor specified. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) { + might_sleep(); + continue; + } + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } + + /* Wait for all callbacks to be invoked. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) + continue; + wait_for_completion(&rs_array[i].completion); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } -EXPORT_SYMBOL_GPL(wait_rcu_gp); +EXPORT_SYMBOL_GPL(__wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) @@ -468,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void); * Post an RCU-tasks callback. First call must be from process context * after the scheduler if fully operational. */ -void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; @@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - rcu_lockdep_assert(!rcu_scheduler_active, - "synchronize_rcu_tasks called too soon"); + RCU_LOCKDEP_WARN(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. 
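
Aside: the reworked __wait_rcu_gp() above registers one callback per requested flavor and only then waits for every completion, so the grace periods elapse concurrently rather than back to back. A standalone model of that register-then-wait shape (the instantly-completing "grace period" and all names below are stand-ins, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

struct fake_sync { bool done; };

static void wake_after_gp(struct fake_sync *s)
{
        s->done = true;         /* plays the role of wakeme_after_rcu() */
}

/* Two "flavors" whose grace period completes immediately in this model. */
typedef void (*call_fn)(struct fake_sync *, void (*)(struct fake_sync *));
static void call_flavor_a(struct fake_sync *s, void (*f)(struct fake_sync *)) { f(s); }
static void call_flavor_b(struct fake_sync *s, void (*f)(struct fake_sync *)) { f(s); }

static void wait_all_gps(int n, call_fn *calls, struct fake_sync *syncs)
{
        int i;

        for (i = 0; i < n; i++) {       /* register one callback per flavor */
                syncs[i].done = false;
                calls[i](&syncs[i], wake_after_gp);
        }
        for (i = 0; i < n; i++)         /* then wait for each completion */
                while (!syncs[i].done)
                        ;               /* the kernel blocks on a completion */
}

int main(void)
{
        call_fn calls[] = { call_flavor_a, call_flavor_b };
        struct fake_sync syncs[2];

        wait_all_gps(2, calls, syncs);
        printf("all requested grace periods elapsed\n");
        return 0;
}
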
*/ wait_rcu_gp(call_rcu_tasks); diff --git a/kernel/reboot.c b/kernel/reboot.c index d20c85d9f8c0..bd30a973fe94 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, kernel_restart(buffer); break; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE case LINUX_REBOOT_CMD_KEXEC: ret = kernel_kexec(); break; diff --git a/kernel/resource.c b/kernel/resource.c index fed052a1bc9f..f150dbbe6f62 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -/* - * Search for a resouce entry that fully contains the specified region. - * If found, return 1 if it is RAM, 0 if not. - * If not found, or region is not fully contained, return -1 +/** + * region_intersects() - determine intersection of region with known resources + * @start: region start address + * @size: size of region + * @name: name of resource (in iomem_resource) * - * Used by the ioremap functions to ensure the user is not remapping RAM and is - * a vast speed up over walking through the resource table page by page. + * Check if the specified region partially overlaps or fully eclipses a + * resource identified by @name. Return REGION_DISJOINT if the region + * does not overlap @name, return REGION_MIXED if the region overlaps + * @type and another resource, and return REGION_INTERSECTS if the + * region overlaps @type and no other defined resource. Note, that + * REGION_INTERSECTS is also returned in the case when the specified + * region overlaps RAM and undefined memory holes. + * + * region_intersect() is used by memory remapping functions to ensure + * the user is not remapping RAM and is a vast speed up over walking + * through the resource table page by page. */ -int region_is_ram(resource_size_t start, unsigned long size) +int region_intersects(resource_size_t start, size_t size, const char *name) { - struct resource *p; - resource_size_t end = start + size - 1; unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - const char *name = "System RAM"; - int ret = -1; + resource_size_t end = start + size - 1; + int type = 0; int other = 0; + struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (p->end < start) - continue; - - if (p->start <= start && end <= p->end) { - /* resource fully contains region */ - if ((p->flags != flags) || strcmp(p->name, name)) - ret = 0; - else - ret = 1; - break; - } - if (end < p->start) - break; /* not found */ + bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + + if (start >= p->start && start <= p->end) + is_type ? type++ : other++; + if (end >= p->start && end <= p->end) + is_type ? type++ : other++; + if (p->start >= start && p->end <= end) + is_type ? type++ : other++; } read_unlock(&resource_lock); - return ret; + + if (other == 0) + return type ? 
REGION_INTERSECTS : REGION_DISJOINT; + + if (type) + return REGION_MIXED; + + return REGION_DISJOINT; } void __weak arch_remove_reservations(struct resource *avail) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10081..4d568ac9319e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (static_key_enabled(&sched_feat_keys[i])) - static_key_slow_dec(&sched_feat_keys[i]); + static_key_disable(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!static_key_enabled(&sched_feat_keys[i])) - static_key_slow_inc(&sched_feat_keys[i]); + static_key_enable(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -623,18 +621,21 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } } } + + if (!is_housekeeping_cpu(cpu)) + cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); return cpu; @@ -816,7 +817,7 @@ static void set_load_weight(struct task_struct *p) /* * SCHED_IDLE tasks get minimal weight: */ - if (p->policy == SCHED_IDLE) { + if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; @@ -826,17 +827,19 @@ static void set_load_weight(struct task_struct *p) load->inv_weight = prio_to_wmult[prio]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1151,15 +1154,45 @@ static int migration_cpu_stop(void *data) return 0; } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +/* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - cpumask_copy(&p->cpus_allowed, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; + + lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); + + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, DEQUEUE_SAVE); + } + if (running) + put_prev_task(rq, p); + + p->sched_class->set_cpus_allowed(p, new_mask); + + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE); +} + /* * Change a given task's CPU affinity. 
Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1169,7 +1202,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { unsigned long flags; struct rq *rq; @@ -1178,6 +1212,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + /* + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; @@ -1214,6 +1257,11 @@ out: return ret; } + +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + return __set_cpus_allowed_ptr(p, new_mask, false); +} EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -1246,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p, new_cpu); + p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); } @@ -1287,12 +1335,16 @@ static int migrate_swap_stop(void *data) struct rq *src_rq, *dst_rq; int ret = -EAGAIN; + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) + return -EAGAIN; + src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) goto unlock; @@ -1528,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) goto out; } + /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - /* No more Mr. Nice Guy. 
*/ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; - + if (IS_ENABLED(CONFIG_CPUSETS)) { + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + } + /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; @@ -1595,6 +1649,15 @@ static void update_avg(u64 *avg, u64 sample) s64 diff = sample - *avg; *avg += diff >> 3; } + +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + #endif /* CONFIG_SMP */ static void @@ -1637,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1654,9 +1717,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); - p->state = TASK_RUNNING; + trace_sched_wakeup(p); + #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -1874,6 +1937,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(p->state & state)) goto out; + trace_sched_waking(p); + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); @@ -1949,6 +2014,8 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; + trace_sched_waking(p); + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); @@ -2016,9 +2083,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -2058,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG + void set_numabalancing_state(bool enabled) { if (enabled) - sched_feat_set("NUMA"); + static_branch_enable(&sched_numa_balancing); else - sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool numabalancing_enabled; - -void set_numabalancing_state(bool enabled) -{ - numabalancing_enabled = enabled; + static_branch_disable(&sched_numa_balancing); } -#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, @@ -2082,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = numabalancing_enabled; + int state = static_branch_likely(&sched_numa_balancing); if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2200,8 +2258,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2210,8 +2268,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) 
cpus++; @@ -2293,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2302,16 +2362,21 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Initialize new task's runnable average */ - init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); + trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_woken) + if (p->sched_class->task_woken) { + /* + * Nothing relies on rq->lock after this, so its fine to + * drop it. + */ + lockdep_unpin_lock(&rq->lock); p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); + } #endif task_rq_unlock(rq, p, &flags); } @@ -2420,7 +2485,6 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - trace_sched_switch(prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); @@ -2454,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) struct mm_struct *mm = rq->prev_mm; long prev_state; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); + rq->prev_mm = NULL; /* @@ -2461,15 +2541,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul <manfred@colorfullife.com> + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); - finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -2489,7 +2568,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) put_task_struct(prev); } - tick_nohz_task_switch(current); + tick_nohz_task_switch(); return rq; } @@ -2539,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) { struct rq *rq; - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). 
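
Aside: finish_task_switch() above now insists on arriving with exactly two levels of preempt_count, one from schedule()'s preempt_disable() and one from taking rq->lock inside __schedule(), as the comment spells out. A toy model of that bookkeeping (userspace stand-ins, not the kernel's primitives; spinlocks are modelled as one extra preemption-disable level):

#include <assert.h>
#include <stdio.h>

static int preempt_count;

static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void)  { preempt_count--; }
static void rq_lock(void)   { preempt_count++; }        /* locks also disable preemption */
static void rq_unlock(void) { preempt_count--; }

static void finish_task_switch(void)
{
        /* Mirrors the new WARN_ONCE(): exactly two levels expected here. */
        assert(preempt_count == 2);
        rq_unlock();                    /* drop rq->lock */
}

static void schedule(void)
{
        preempt_disable();              /* 1 */
        rq_lock();                      /* 2, taken inside __schedule() */
        finish_task_switch();
        preempt_enable();               /* back to 0 */
}

int main(void)
{
        schedule();
        printf("preempt_count after schedule(): %d\n", preempt_count);
        return 0;
}
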
+ */ + rq = finish_task_switch(prev); balance_callback(rq); preempt_enable(); @@ -2614,13 +2700,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); @@ -2891,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); + BUG_ON(task_stack_end_corrupted(prev)); #endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) + + if (unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); + } rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -2985,7 +3076,7 @@ again: * * WARNING: must be called with preemption disabled! */ -static void __sched __schedule(void) +static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -2997,6 +3088,17 @@ static void __sched __schedule(void) rcu_note_context_switch(); prev = rq->curr; + /* + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. + * + * It also avoids the below schedule_debug() test from complaining + * about this. 
+ */ + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); + schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3014,7 +3116,7 @@ static void __sched __schedule(void) rq->clock_skip_update <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { @@ -3050,6 +3152,7 @@ static void __sched __schedule(void) rq->curr = next; ++*switch_count; + trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ cpu = cpu_of(rq); } else { @@ -3079,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(); + __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); } @@ -3119,9 +3222,9 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { - preempt_active_enter(); - __schedule(); - preempt_active_exit(); + preempt_disable_notrace(); + __schedule(true); + preempt_enable_no_resched_notrace(); /* * Check again in case we missed a preemption opportunity @@ -3172,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { - /* - * Use raw __prempt_count() ops that don't call function. - * We can't call functions before disabling preemption which - * disarm preemption tracing recursions. - */ - __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); - barrier(); + preempt_disable_notrace(); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing * an infinite recursion. 
*/ prev_ctx = exception_enter(); - __schedule(); + __schedule(true); exception_exit(prev_ctx); - barrier(); - __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); + preempt_enable_no_resched_notrace(); } while (need_resched()); } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); @@ -3212,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) prev_state = exception_enter(); do { - preempt_active_enter(); + preempt_disable(); local_irq_enable(); - __schedule(); + __schedule(true); local_irq_disable(); - preempt_active_exit(); + sched_preempt_enable_no_resched(); } while (need_resched()); exception_exit(prev_state); @@ -3244,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; struct rq *rq; const struct sched_class *prev_class; @@ -3276,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3294,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag = ENQUEUE_REPLENISH; + enqueue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3302,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag = ENQUEUE_HEAD; + enqueue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3354,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice) } queued = task_on_rq_queued(p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3363,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3684,10 +3780,7 @@ recheck: } else { reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_DEADLINE && - policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + if (!valid_policy(policy)) return -EINVAL; } @@ -3743,7 +3836,7 @@ recheck: * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (idle_policy(p->policy) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3868,7 +3961,7 @@ change: queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -3878,11 +3971,15 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { + int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - enqueue_task(rq, p, oldprio <= p->prio ? 
ENQUEUE_HEAD : 0); + if (oldprio <= p->prio) + enqueue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, enqueue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -3960,6 +4057,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -4340,7 +4438,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = set_cpus_allowed_ptr(p, new_mask); + retval = __set_cpus_allowed_ptr(p, new_mask, true); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -4492,7 +4590,7 @@ SYSCALL_DEFINE0(sched_yield) int __sched _cond_resched(void) { - if (should_resched()) { + if (should_resched(0)) { preempt_schedule_common(); return 1; } @@ -4510,7 +4608,7 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - int resched = should_resched(); + int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; lockdep_assert_held(lock); @@ -4532,7 +4630,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (should_resched()) { + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { local_bh_enable(); preempt_schedule_common(); local_bh_disable(); @@ -4865,13 +4963,22 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4888,10 +4995,11 @@ void init_idle(struct task_struct *idle, int cpu) rq->curr = rq->idle = idle; idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); @@ -4902,7 +5010,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif } @@ -5021,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid) running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -5030,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); task_rq_unlock(rq, p, &flags); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5124,24 +5232,47 @@ static void migrate_tasks(struct rq *dead_rq) break; /* - * Ensure rq->lock covers the entire task selection - * until the migration. + * pick_next_task assumes pinned rq->lock. 
*/ lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + /* + * Rules for changing task_struct::cpus_allowed are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * + * Drop rq->lock is not quite as disastrous as it usually is + * because !cpu_active at this point, which means load-balance + * will not interfere. Also, stop-machine. + */ + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock(&rq->lock); + raw_spin_lock(&next->pi_lock); + raw_spin_lock(&rq->lock); + + /* + * Since we're inside stop-machine, _nothing_ should have + * changed the task, WARN if weird stuff happened, because in + * that case the above rq->lock drop is a fail too. + */ + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + raw_spin_unlock(&next->pi_lock); + continue; + } + /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - lockdep_unpin_lock(&rq->lock); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); } + raw_spin_unlock(&next->pi_lock); } rq->stop = stop; @@ -5311,8 +5442,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); @@ -5429,13 +5559,27 @@ static void set_cpu_rq_start_time(void) static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + + case CPU_ONLINE: + /* + * At this point a starting CPU has marked itself as online via + * set_cpu_online(). But it might not yet have marked itself + * as active, which is essential from here on. + */ + set_cpu_active(cpu, true); + stop_machine_unpark(cpu); + return NOTIFY_OK; + case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); + set_cpu_active(cpu, true); return NOTIFY_OK; + default: return NOTIFY_DONE; } @@ -6367,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -6445,8 +6590,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { @@ -7134,9 +7281,6 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - /* nohz_full won't take effect without isolating the cpus. 
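/*
 * Illustrative aside (not part of this patch): the migrate_tasks() hunk above
 * has to drop rq->lock so that next->pi_lock can be taken first (pi_lock nests
 * outside rq->lock), then retakes rq->lock and revalidates that the task did
 * not move while the lock was dropped. A minimal user-space sketch of that
 * ordering, with pthread mutexes and a hypothetical task_rq field standing in
 * for the kernel primitives:
 */
#include <pthread.h>
#include <stdbool.h>

struct ctx {
        pthread_mutex_t pi_lock;        /* outer lock, like p->pi_lock */
        pthread_mutex_t rq_lock;        /* inner lock, like rq->lock   */
        int             task_rq;        /* stands in for task_rq(next) */
};

/* Called with rq_lock held; returns with rq_lock held. */
static bool lock_pi_then_rq(struct ctx *c, int this_rq)
{
        pthread_mutex_unlock(&c->rq_lock);

        pthread_mutex_lock(&c->pi_lock);        /* take the outer lock first */
        pthread_mutex_lock(&c->rq_lock);        /* then reacquire the inner  */

        /*
         * State may have changed while rq_lock was dropped: revalidate,
         * just as the WARN_ON(task_rq(next) != rq || ...) above does.
         */
        if (c->task_rq != this_rq) {
                pthread_mutex_unlock(&c->pi_lock);
                return false;           /* caller skips this task */
        }
        return true;                    /* caller proceeds, drops pi_lock when done */
}

int main(void)
{
        struct ctx c = {
                .pi_lock = PTHREAD_MUTEX_INITIALIZER,
                .rq_lock = PTHREAD_MUTEX_INITIALIZER,
                .task_rq = 0,
        };

        pthread_mutex_lock(&c.rq_lock);
        if (lock_pi_then_rq(&c, 0))
                pthread_mutex_unlock(&c.pi_lock);
        pthread_mutex_unlock(&c.rq_lock);
        return 0;
}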
*/ - tick_nohz_full_add_cpus_to(cpu_isolated_map); - sched_init_numa(); /* @@ -7369,7 +7513,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + int nested = preempt_count() + rcu_preempt_depth(); return (nested == preempt_offset); } @@ -7616,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, 0); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7632,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -7640,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, ENQUEUE_RESTORE); task_rq_unlock(rq, tsk, &flags); } @@ -8068,7 +8212,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task) +static void cpu_cgroup_fork(struct task_struct *task, void *private) { sched_move_task(task); } @@ -8100,21 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, sched_move_task(task); } -static void cpu_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. 
- */ - if (!(task->flags & PF_EXITING)) - return; - - sched_move_task(task); -} - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -8446,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .exit = cpu_cgroup_exit, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index c6acb07466bb..5a75b08cfd85 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,11 +31,6 @@ static inline int right_child(int i) return (i << 1) + 2; } -static inline int dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 1a0a6ef2fbe1..fcbdf83fed7e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -2,6 +2,7 @@ #define _LINUX_CPUDL_H #include <linux/sched.h> +#include <linux/sched/deadline.h> #define IDX_INVALID -1 diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f5a64ffad176..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -555,48 +556,43 @@ drop_precision: } /* - * Atomically advance counter to the new value. Interrupts, vcpu - * scheduling, and scaling inaccuracies can cause cputime_advance - * to be occasionally called with a new value smaller than counter. - * Let's enforce atomicity. + * Adjust tick based cputime random precision against scheduler runtime + * accounting. * - * Normally a caller will only go through this loop once, or not - * at all in case a previous caller updated counter the same jiffy. - */ -static void cputime_advance(cputime_t *counter, cputime_t new) -{ - cputime_t old; - - while (new > (old = READ_ONCE(*counter))) - cmpxchg_cputime(counter, old, new); -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. + * Tick based cputime accounting depend on random scheduling timeslices of a + * task to be interrupted or not by the timer. Depending on these + * circumstances, the number of these interrupts may be over or + * under-optimistic, matching the real user and system cputime with a variable + * precision. + * + * Fix this by scaling these tick based values against the total runtime + * accounted by the CFS scheduler. + * + * This code provides the following guarantees: + * + * stime + utime == rtime + * stime_i+1 >= stime_i, utime_i+1 >= utime_i + * + * Assuming that rtime_i+1 >= rtime_i. */ static void cputime_adjust(struct task_cputime *curr, - struct cputime *prev, + struct prev_cputime *prev, cputime_t *ut, cputime_t *st) { cputime_t rtime, stime, utime; + unsigned long flags; - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. 
- * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ + /* Serialize concurrent callers such that we can honour our guarantees */ + raw_spin_lock_irqsave(&prev->lock, flags); rtime = nsecs_to_cputime(curr->sum_exec_runtime); /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. + * This is possible under two circumstances: + * - rtime isn't monotonic after all (a bug); + * - we got reordered by the lock. + * + * In both cases this acts as a filter such that the rest of the code + * can assume it is monotonic regardless of anything else. */ if (prev->stime + prev->utime >= rtime) goto out; @@ -606,22 +602,46 @@ static void cputime_adjust(struct task_cputime *curr, if (utime == 0) { stime = rtime; - } else if (stime == 0) { - utime = rtime; - } else { - cputime_t total = stime + utime; + goto update; + } - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; + if (stime == 0) { + utime = rtime; + goto update; } - cputime_advance(&prev->stime, stime); - cputime_advance(&prev->utime, utime); + stime = scale_stime((__force u64)stime, (__force u64)rtime, + (__force u64)(stime + utime)); + + /* + * Make sure stime doesn't go backwards; this preserves monotonicity + * for utime because rtime is monotonic. + * + * utime_i+1 = rtime_i+1 - stime_i + * = rtime_i+1 - (rtime_i - utime_i) + * = (rtime_i+1 - rtime_i) + utime_i + * >= utime_i + */ + if (stime < prev->stime) + stime = prev->stime; + utime = rtime - stime; + + /* + * Make sure utime doesn't go backwards; this still preserves + * monotonicity for stime, analogous argument to above. + */ + if (utime < prev->utime) { + utime = prev->utime; + stime = rtime - utime; + } +update: + prev->stime = stime; + prev->utime = utime; out: *ut = prev->utime; *st = prev->stime; + raw_spin_unlock_irqrestore(&prev->lock, flags); } void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) @@ -633,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a17af35670a..8b0a15e285f9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Queueing this task back might have overloaded rq, check if we need * to kick someone away. */ - if (has_pushable_dl_tasks(rq)) + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + lockdep_unpin_lock(&rq->lock); push_dl_task(rq); + lockdep_pin_lock(&rq->lock); + } #endif unlock: @@ -953,7 +960,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) /* * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (relative) deadline is + * task if we have one and its (absolute) deadline is * smaller than our one... OTW we keep our runtime and * deadline. 
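/*
 * Illustrative aside (not part of this patch): the rewritten cputime_adjust()
 * above guarantees stime + utime == rtime and that neither field ever goes
 * backwards, by scaling the tick-based stime against rtime and then clamping
 * both fields against the values reported last time. A standalone sketch of
 * that clamping with plain 64-bit tick counts instead of cputime_t:
 */
#include <stdio.h>
#include <stdint.h>

static void adjust(uint64_t rtime, uint64_t stime, uint64_t utime,
                   uint64_t *prev_stime, uint64_t *prev_utime)
{
        /* Never report less than what was already exported. */
        if (*prev_stime + *prev_utime >= rtime)
                return;

        if (utime == 0) {
                stime = rtime;
        } else if (stime == 0) {
                utime = rtime;
        } else {
                /*
                 * Scale tick-based stime against scheduler runtime (naive u64
                 * multiply here; the kernel's scale_stime() guards against
                 * the overflow this could have on real cputime values) ...
                 */
                stime = stime * rtime / (stime + utime);
                /* ... but never let it go backwards ... */
                if (stime < *prev_stime)
                        stime = *prev_stime;
                utime = rtime - stime;
                /* ... and the same for utime. */
                if (utime < *prev_utime) {
                        utime = *prev_utime;
                        stime = rtime - utime;
                }
        }

        *prev_stime = stime;
        *prev_utime = utime;
}

int main(void)
{
        uint64_t ps = 0, pu = 0;

        adjust(100, 30, 90, &ps, &pu);
        printf("stime=%llu utime=%llu\n",       /* 25/75, sums to rtime   */
               (unsigned long long)ps, (unsigned long long)pu);
        adjust(150, 60, 60, &ps, &pu);
        printf("stime=%llu utime=%llu\n",       /* 75/75, both monotonic  */
               (unsigned long long)ps, (unsigned long long)pu);
        return 0;
}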
*/ @@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) int target = find_later_rq(p); if (target != -1 && - dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr)) + (dl_time_before(p->dl.deadline, + cpu_rq(target)->dl.earliest_dl.curr) || + (cpu_rq(target)->dl.dl_nr_running == 0))) cpu = target; } rcu_read_unlock(); @@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (!dl_time_before(task->dl.deadline, + if (later_rq->dl.dl_nr_running && + !dl_time_before(task->dl.deadline, later_rq->dl.earliest_dl.curr)) { /* * Target rq has tasks of equal or earlier deadline, @@ -1563,7 +1572,7 @@ out: static void push_dl_tasks(struct rq *rq) { - /* Terminates as it moves a -deadline task */ + /* push_dl_task() will return true if it moved a -deadline task */ while (push_dl_task(rq)) ; } @@ -1657,7 +1666,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_dl_tasks(rq) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || @@ -1669,9 +1677,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { - struct rq *rq; struct root_domain *src_rd; - int weight; + struct rq *rq; BUG_ON(!dl_task(p)); @@ -1697,37 +1704,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - /* - * Update only if the task is actually running (i.e., - * it is on the rq AND it is not throttled). - */ - if (!on_dl_rq(&p->dl)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. 
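/*
 * Illustrative aside (not part of this patch): dl_time_before(), used in the
 * deadline comparisons above (and, per the cpudeadline.h hunk, now taken from
 * <linux/sched/deadline.h> rather than a local copy), is the usual wrap-safe
 * ordering test on u64 timestamps: (s64)(a - b) < 0. A small user-space
 * demonstration of why the signed difference is preferred over a plain '<':
 */
#include <stdio.h>
#include <stdint.h>

static inline int dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 100;  /* timestamp just before wrap */
        uint64_t wrapped   = 50;                /* timestamp just after wrap  */

        printf("%d\n", dl_time_before(near_wrap, wrapped));    /* 1: earlier */
        printf("%d\n", dl_time_before(wrapped, near_wrap));    /* 0: later   */
        /* A plain 'near_wrap < wrapped' would report the opposite. */
        return 0;
}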
- */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_dl_task(rq, p); - BUG_ON(!rq->dl.dl_nr_migratory); - rq->dl.dl_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_dl_task(rq, p); - rq->dl.dl_nr_migratory++; - } - - update_dl_migration(&rq->dl); + set_cpus_allowed_common(p, new_mask); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4222ec50ab88..641511771ae6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) - if (!se) { - struct sched_avg *avg = &cpu_rq(cpu)->avg; - P(avg->runnable_avg_sum); - P(avg->avg_period); + if (!se) return; - } - PN(se->exec_start); PN(se->vruntime); @@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif P(se->load.weight); #ifdef CONFIG_SMP - P(se->avg.runnable_avg_sum); - P(se->avg.running_avg_sum); - P(se->avg.avg_period); - P(se->avg.load_avg_contrib); - P(se->avg.utilization_avg_contrib); - P(se->avg.decay_count); + P(se->avg.load_avg); + P(se->avg.util_avg); #endif #undef PN #undef P @@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %lu\n", "load_avg", + cfs_rq->avg.load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", - cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", - cfs_rq->utilization_load_avg); + SEQ_printf(m, " .%-30s: %lu\n", "util_avg", + cfs_rq->avg.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", + atomic_long_read(&cfs_rq->removed_load_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", + atomic_long_read(&cfs_rq->removed_util_avg)); #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", - cfs_rq->tg_load_contrib); - SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", - cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", + cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", - atomic_read(&cfs_rq->tg->runnable_avg)); #endif #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP - P(se.avg.runnable_avg_sum); - P(se.avg.running_avg_sum); - P(se.avg.avg_period); - P(se.avg.load_avg_contrib); - P(se.avg.utilization_avg_contrib); - P(se.avg.decay_count); + P(se.avg.load_sum); + P(se.avg.util_sum); + P(se.avg.load_avg); + P(se.avg.util_avg); + P(se.avg.last_update_time); #endif P(policy); P(prio); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d113c3ba8bc4..824aa9f501a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static void 
update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update); - static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { @@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) } cfs_rq->on_list = 1; - /* We should have no load, but we need to update last_decay. */ - update_cfs_rq_blocked_load(cfs_rq, 0); } } @@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) */ static u64 __sched_period(unsigned long nr_running) { - u64 period = sysctl_sched_latency; - unsigned long nr_latency = sched_nr_latency; - - if (unlikely(nr_running > nr_latency)) { - period = sysctl_sched_min_granularity; - period *= nr_running; - } - - return period; + if (unlikely(nr_running > sched_nr_latency)) + return nr_running * sysctl_sched_min_granularity; + else + return sysctl_sched_latency; } /* @@ -669,22 +659,38 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); -static inline void __update_task_entity_contrib(struct sched_entity *se); -static inline void __update_task_entity_utilization(struct sched_entity *se); +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are + * dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ -/* Give new task start runnable values to heavy its load in infant time */ -void init_task_runnable_average(struct task_struct *p) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) { - u32 slice; + struct sched_avg *sa = &se->avg; - slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; - p->se.avg.avg_period = slice; - __update_task_entity_contrib(&p->se); - __update_task_entity_utilization(&p->se); + sa->last_update_time = 0; + /* + * sched_avg's period_contrib should be strictly less then 1024, so + * we give it 1023 to make sure it is almost a period (1024us), and + * will definitely be update (after enqueue). 
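/*
 * Illustrative aside (not part of this patch): the simplified __sched_period()
 * above keeps the same arithmetic as before. With the base tunables from
 * fair.c (before CPU-count scaling) of sysctl_sched_latency = 6ms,
 * sysctl_sched_min_granularity = 0.75ms and sched_nr_latency = 8, the period
 * is one latency window until more than 8 tasks are runnable, after which it
 * grows linearly:
 */
#include <stdio.h>

#define SYSCTL_SCHED_LATENCY            6000000ULL      /* 6 ms, in ns    */
#define SYSCTL_SCHED_MIN_GRANULARITY     750000ULL      /* 0.75 ms, in ns */
#define SCHED_NR_LATENCY                8

static unsigned long long sched_period(unsigned long nr_running)
{
        if (nr_running > SCHED_NR_LATENCY)
                return nr_running * SYSCTL_SCHED_MIN_GRANULARITY;
        return SYSCTL_SCHED_LATENCY;
}

int main(void)
{
        printf("%llu\n", sched_period(4));      /* 6000000: one latency window   */
        printf("%llu\n", sched_period(10));     /* 7500000: 10 * min granularity */
        return 0;
}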
+ */ + sa->period_contrib = 1023; + sa->load_avg = scale_load_down(se->load.weight); + sa->load_sum = sa->load_avg * LOAD_AVG_MAX; + sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else -void init_task_runnable_average(struct task_struct *p) +void init_entity_runnable_average(struct sched_entity *se) { } #endif @@ -1415,8 +1421,9 @@ static bool numa_has_capacity(struct task_numa_env *env) * --------------------- vs --------------------- * src->compute_capacity dst->compute_capacity */ - if (src->load * dst->compute_capacity > - dst->load * src->compute_capacity) + if (src->load * dst->compute_capacity * env->imbalance_pct > + + dst->load * src->compute_capacity * 100) return true; return false; @@ -1702,8 +1709,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.avg_period; + delta = p->se.avg.load_sum / p->se.load.weight; + *period = LOAD_AVG_MAX; } p->last_sum_exec_runtime = runtime; @@ -2063,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!numabalancing_enabled) + if (!static_branch_likely(&sched_numa_balancing)) return; /* for example, ksmd faulting in a user's mm */ @@ -2151,7 +2158,7 @@ void task_numa_work(struct callback_head *work) struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; - long pages; + long pages, virtpages; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -2197,9 +2204,11 @@ void task_numa_work(struct callback_head *work) start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + virtpages = pages * 8; /* Scan up to this much virtual space */ if (!pages) return; + down_read(&mm->mmap_sem); vma = find_vma(mm, start); if (!vma) { @@ -2234,18 +2243,22 @@ void task_numa_work(struct callback_head *work) start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); end = min(end, vma->vm_end); - nr_pte_updates += change_prot_numa(vma, start, end); + nr_pte_updates = change_prot_numa(vma, start, end); /* - * Scan sysctl_numa_balancing_scan_size but ensure that - * at least one PTE is updated so that unused virtual - * address space is quickly skipped. + * Try to scan sysctl_numa_balancing_size worth of + * hpages that have at least one present PTE that + * is not already pte-numa. If the VMA contains + * areas that are unused or already full of prot_numa + * PTEs, scan up to virtpages, to skip through those + * areas faster. */ if (nr_pte_updates) pages -= (end - start) >> PAGE_SHIFT; + virtpages -= (end - start) >> PAGE_SHIFT; start = end; - if (pages <= 0) + if (pages <= 0 || virtpages <= 0) goto out; cond_resched(); @@ -2351,12 +2364,12 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) long tg_weight; /* - * Use this CPU's actual weight instead of the last load_contribution - * to gain a more accurate current total weight. See - * update_cfs_rq_load_contribution(). 
+ * Use this CPU's real-time load instead of the last load contribution + * as the updating of the contribution is delayed, and we will use the + * the real-time load to calc the share. See update_tg_load_avg(). */ tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_contrib; + tg_weight -= cfs_rq->tg_load_avg_contrib; tg_weight += cfs_rq->load.weight; return tg_weight; @@ -2429,14 +2442,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables below are dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ - /* Precomputed fixed inverse multiplies for multiplication by y^n */ static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -2485,9 +2490,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val *= runnable_avg_yN_inv[local_n]; - /* We don't use SRR here since we always want to round down. */ - return val >> 32; + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + return val; } /* @@ -2518,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 +#error "load tracking assumes 2^10 as unit" +#endif + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2546,23 +2556,22 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, - struct sched_avg *sa, - int runnable, - int running) +static __always_inline int +__update_load_avg(u64 now, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, periods; - u32 runnable_contrib; - int delta_w, decayed = 0; - unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + u64 delta, scaled_delta, periods; + u32 contrib; + unsigned int delta_w, scaled_delta_w, decayed = 0; + unsigned long scale_freq, scale_cpu; - delta = now - sa->last_runnable_update; + delta = now - sa->last_update_time; /* * This should only happen when time goes backwards, which it * unfortunately does during sched clock init when we swap over to TSC. 
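/*
 * Illustrative aside (not part of this patch): the decay factor y behind the
 * tables and decay_load() above is defined by y^LOAD_AVG_PERIOD == 1/2, and
 * LOAD_AVG_MAX is (roughly) the limit of the geometric series
 * 1024 * (1 + y + y^2 + ...). A quick floating-point check; the in-kernel
 * constant (47742) sits a little below the exact limit, matching the
 * round-down behaviour of the kernel's fixed-point tables:
 */
#include <stdio.h>
#include <math.h>

#define LOAD_AVG_PERIOD 32

int main(void)
{
        double y = pow(0.5, 1.0 / LOAD_AVG_PERIOD);
        double sum = 0.0;
        int n;

        for (n = 0; n < 2000; n++)              /* effectively the full series */
                sum += 1024.0 * pow(y, n);

        printf("y^32          = %f\n", pow(y, LOAD_AVG_PERIOD)); /* 0.500000 */
        printf("1024*sum(y^n) = %.0f\n", sum);                   /* ~47788   */
        printf("1024/(1 - y)  = %.0f\n", 1024.0 / (1.0 - y));    /* ~47788   */
        return 0;
}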
*/ if ((s64)delta < 0) { - sa->last_runnable_update = now; + sa->last_update_time = now; return 0; } @@ -2573,26 +2582,35 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, delta >>= 10; if (!delta) return 0; - sa->last_runnable_update = now; + sa->last_update_time = now; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->avg_period % 1024; + delta_w = sa->period_contrib; if (delta + delta_w >= 1024) { - /* period roll-over */ decayed = 1; + /* how much left for next period will start over, we don't know yet */ + sa->period_contrib = 0; + /* * Now that we know we're crossing a period boundary, figure * out how much from delta we need to complete the current * period and accrue it. */ delta_w = 1024 - delta_w; - if (runnable) - sa->runnable_avg_sum += delta_w; + scaled_delta_w = cap_scale(delta_w, scale_freq); + if (weight) { + sa->load_sum += weight * scaled_delta_w; + if (cfs_rq) { + cfs_rq->runnable_load_sum += + weight * scaled_delta_w; + } + } if (running) - sa->running_avg_sum += delta_w * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta_w; + sa->util_sum += scaled_delta_w * scale_cpu; delta -= delta_w; @@ -2600,341 +2618,221 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, periods = delta / 1024; delta %= 1024; - sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, - periods + 1); - sa->running_avg_sum = decay_load(sa->running_avg_sum, - periods + 1); - sa->avg_period = decay_load(sa->avg_period, - periods + 1); + sa->load_sum = decay_load(sa->load_sum, periods + 1); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods + 1); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - runnable_contrib = __compute_runnable_contrib(periods); - if (runnable) - sa->runnable_avg_sum += runnable_contrib; + contrib = __compute_runnable_contrib(periods); + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } if (running) - sa->running_avg_sum += runnable_contrib * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += runnable_contrib; + sa->util_sum += contrib * scale_cpu; } /* Remainder of delta accrued against u_0` */ - if (runnable) - sa->runnable_avg_sum += delta; + scaled_delta = cap_scale(delta, scale_freq); + if (weight) { + sa->load_sum += weight * scaled_delta; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * scaled_delta; + } if (running) - sa->running_avg_sum += delta * scale_freq - >> SCHED_CAPACITY_SHIFT; - sa->avg_period += delta; - - return decayed; -} - -/* Synchronize an entity's decay with its parenting cfs_rq.*/ -static inline u64 __synchronize_entity_decay(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 decays = atomic64_read(&cfs_rq->decay_counter); + sa->util_sum += scaled_delta * scale_cpu; - decays -= se->avg.decay_count; - se->avg.decay_count = 0; - if (!decays) - return 0; + sa->period_contrib += delta; - se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.utilization_avg_contrib = - decay_load(se->avg.utilization_avg_contrib, decays); + if (decayed) { + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + 
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + } - return decays; + return decayed; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) -{ - struct task_group *tg = cfs_rq->tg; - long tg_contrib; - - tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; - tg_contrib -= cfs_rq->tg_load_contrib; - - if (!tg_contrib) - return; - - if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic_long_add(tg_contrib, &tg->load_avg); - cfs_rq->tg_load_contrib += tg_contrib; - } -} - /* - * Aggregate cfs_rq runnable averages into an equivalent task_group - * representation for computing load contributions. + * Updating tg's load_avg is necessary before update_cfs_share (which is done) + * and effective_load (which is not done because it is too costly). */ -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { - struct task_group *tg = cfs_rq->tg; - long contrib; - - /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->avg_period + 1); - contrib -= cfs_rq->tg_runnable_contrib; + long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; - if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { - atomic_add(contrib, &tg->runnable_avg); - cfs_rq->tg_runnable_contrib += contrib; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } -static inline void __update_group_entity_contrib(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = group_cfs_rq(se); - struct task_group *tg = cfs_rq->tg; - int runnable_avg; - - u64 contrib; - - contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div_u64(contrib, - atomic_long_read(&tg->load_avg) + 1); - - /* - * For group entities we need to compute a correction term in the case - * that they are consuming <1 cpu so that we would contribute the same - * load as a task of equal weight. - * - * Explicitly co-ordinating this measurement would be expensive, but - * fortunately the sum of each cpus contribution forms a usable - * lower-bound on the true value. - * - * Consider the aggregate of 2 contributions. Either they are disjoint - * (and the sum represents true value) or they are disjoint and we are - * understating by the aggregate of their overlap. - * - * Extending this to N cpus, for a given overlap, the maximum amount we - * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of - * cpus that overlap for this interval and w_i is the interval width. - * - * On a small machine; the first term is well-bounded which bounds the - * total error since w_i is a subset of the period. Whereas on a - * larger machine, while this first term can be larger, if w_i is the - * of consequential size guaranteed to see n_i*w_i quickly converge to - * our upper bound of 1-cpu. 
- */ - runnable_avg = atomic_read(&tg->runnable_avg); - if (runnable_avg < NICE_0_LOAD) { - se->avg.load_avg_contrib *= runnable_avg; - se->avg.load_avg_contrib >>= NICE_0_SHIFT; - } -} - -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, - runnable, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, - int force_update) {} -static inline void __update_tg_runnable_avg(struct sched_avg *sa, - struct cfs_rq *cfs_rq) {} -static inline void __update_group_entity_contrib(struct sched_entity *se) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void __update_task_entity_contrib(struct sched_entity *se) -{ - u32 contrib; - - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.avg_period + 1); - se->avg.load_avg_contrib = scale_load(contrib); -} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -/* Compute the current contribution to load_avg by se, return any delta */ -static long __update_entity_load_avg_contrib(struct sched_entity *se) +/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - long old_contrib = se->avg.load_avg_contrib; + struct sched_avg *sa = &cfs_rq->avg; + int decayed, removed = 0; - if (entity_is_task(se)) { - __update_task_entity_contrib(se); - } else { - __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); - __update_group_entity_contrib(se); + if (atomic_long_read(&cfs_rq->removed_load_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + sa->load_avg = max_t(long, sa->load_avg - r, 0); + sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + removed = 1; } - return se->avg.load_avg_contrib - old_contrib; -} + if (atomic_long_read(&cfs_rq->removed_util_avg)) { + long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); + sa->util_avg = max_t(long, sa->util_avg - r, 0); + sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + } + decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); -static inline void __update_task_entity_utilization(struct sched_entity *se) -{ - u32 contrib; +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif - /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ - contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); - contrib /= (se->avg.avg_period + 1); - se->avg.utilization_avg_contrib = scale_load(contrib); + return decayed || removed; } -static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct sched_entity *se, int update_tg) { - long old_contrib = se->avg.utilization_avg_contrib; - - if (entity_is_task(se)) - __update_task_entity_utilization(se); - else - se->avg.utilization_avg_contrib = - group_cfs_rq(se)->utilization_load_avg; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int cpu = cpu_of(rq_of(cfs_rq)); - return se->avg.utilization_avg_contrib - 
old_contrib; -} + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + __update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); -static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, - long load_contrib) -{ - if (likely(load_contrib < cfs_rq->blocked_load_avg)) - cfs_rq->blocked_load_avg -= load_contrib; - else - cfs_rq->blocked_load_avg = 0; + if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + update_tg_load_avg(cfs_rq, 0); } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - -/* Update a sched_entity's runnable average */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta, utilization_delta; - int cpu = cpu_of(rq_of(cfs_rq)); - u64 now; + if (!sched_feat(ATTACH_AGE_LOAD)) + goto skip_aging; /* - * For a group entity we need to use their owned cfs_rq_clock_task() in - * case they are the parent of a throttled hierarchy. + * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time. */ - if (entity_is_task(se)) - now = cfs_rq_clock_task(cfs_rq); - else - now = cfs_rq_clock_task(group_cfs_rq(se)); + if (se->avg.last_update_time) { + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, 0, 0, NULL); - if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, - cfs_rq->curr == se)) - return; + /* + * XXX: we could have just aged the entire load away if we've been + * absent from the fair class for too long. + */ + } - contrib_delta = __update_entity_load_avg_contrib(se); - utilization_delta = __update_entity_utilization_avg_contrib(se); +skip_aging: + se->avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; +} - if (!update_cfs_rq) - return; +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); - if (se->on_rq) { - cfs_rq->runnable_load_avg += contrib_delta; - cfs_rq->utilization_load_avg += utilization_delta; - } else { - subtract_blocked_load_contrib(cfs_rq, -contrib_delta); - } + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); } -/* - * Decay the load contributed by all blocked children and account this so that - * their contribution may appropriately discounted when they wake up. 
- */ -static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +/* Add the load generated by se into cfs_rq's load average */ +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 now = cfs_rq_clock_task(cfs_rq) >> 20; - u64 decays; - - decays = now - cfs_rq->last_decay; - if (!decays && !force_update) - return; + struct sched_avg *sa = &se->avg; + u64 now = cfs_rq_clock_task(cfs_rq); + int migrated, decayed; - if (atomic_long_read(&cfs_rq->removed_load)) { - unsigned long removed_load; - removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); - subtract_blocked_load_contrib(cfs_rq, removed_load); + migrated = !sa->last_update_time; + if (!migrated) { + __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); } - if (decays) { - cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, - decays); - atomic64_add(decays, &cfs_rq->decay_counter); - cfs_rq->last_decay = now; - } + decayed = update_cfs_rq_load_avg(now, cfs_rq); + + cfs_rq->runnable_load_avg += sa->load_avg; + cfs_rq->runnable_load_sum += sa->load_sum; - __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); + if (migrated) + attach_entity_load_avg(cfs_rq, se); + + if (decayed || migrated) + update_tg_load_avg(cfs_rq, 0); } -/* Add the load generated by se into cfs_rq's child load-average */ -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) +/* Remove the runnable load generated by se from cfs_rq's runnable load average */ +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * We track migrations using entity decay_count <= 0, on a wake-up - * migration we use a negative decay count to track the remote decays - * accumulated while sleeping. - * - * Newly forked tasks are enqueued with se->avg.decay_count == 0, they - * are seen by enqueue_entity_load_avg() as a migration with an already - * constructed load_avg_contrib. - */ - if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); - if (se->avg.decay_count) { - /* - * In a wake-up migration we have to approximate the - * time sleeping. This is because we can't synchronize - * clock_task between the two cpus, and it is not - * guaranteed to be read-safe. Instead, we can - * approximate this using our carried decays, which are - * explicitly atomically readable. 
- */ - se->avg.last_runnable_update -= (-se->avg.decay_count) - << 20; - update_entity_load_avg(se, 0); - /* Indicate that we're now synchronized and on-rq */ - se->avg.decay_count = 0; - } - wakeup = 0; - } else { - __synchronize_entity_decay(se); - } - - /* migrated tasks did not contribute to our blocked load */ - if (wakeup) { - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - update_entity_load_avg(se, 0); - } + update_load_avg(se, 1); - cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !wakeup); + cfs_rq->runnable_load_avg = + max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); + cfs_rq->runnable_load_sum = + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* - * Remove se's load from this cfs_rq child load-average, if the entity is - * transitioning to a blocked state we track its projected decay using - * blocked_load_avg. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) +void remove_entity_load_avg(struct sched_entity *se) { - update_entity_load_avg(se, 1); - /* we force update consideration on load-balancer moves */ - update_cfs_rq_blocked_load(cfs_rq, !sleep); + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + +#ifndef CONFIG_64BIT + u64 last_update_time_copy; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); +#else + last_update_time = cfs_rq->avg.last_update_time; +#endif - cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; - cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; - if (sleep) { - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - } /* migrations, e.g. 
sleep=0 leave decay_count == 0 */ + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } /* @@ -2944,7 +2842,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, */ void idle_enter_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 1); } /* @@ -2954,24 +2851,33 @@ void idle_enter_fair(struct rq *this_rq) */ void idle_exit_fair(struct rq *this_rq) { - update_rq_runnable_avg(this_rq, 0); +} + +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->runnable_load_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; } static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_entity_load_avg(struct sched_entity *se, - int update_cfs_rq) {} -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} -static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int wakeup) {} -static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, - struct sched_entity *se, - int sleep) {} -static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, - int force_update) {} +static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void +enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void remove_entity_load_avg(struct sched_entity *se) {} + +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline int idle_balance(struct rq *rq) { @@ -3103,7 +3009,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3178,7 +3084,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { @@ -3268,7 +3174,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_entity_load_avg(se, 1); + update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3368,7 +3274,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ - update_entity_load_avg(prev, 1); + update_load_avg(prev, 0); } cfs_rq->curr = NULL; } @@ -3384,8 +3290,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
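/*
 * Illustrative aside (not part of this patch): on 32-bit kernels
 * remove_entity_load_avg() above cannot read the 64-bit last_update_time
 * atomically, so update_cfs_rq_load_avg() publishes a second copy behind an
 * smp_wmb() and the reader loops until both copies agree, never acting on a
 * torn value. A user-space sketch of that pairing, with C11 fences standing
 * in for the kernel barriers:
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct clock_pair {
        uint64_t time;          /* may be stored as two halves on 32-bit  */
        uint64_t time_copy;     /* published only once 'time' is complete */
};

static void publish(struct clock_pair *c, uint64_t now)
{
        c->time = now;
        atomic_thread_fence(memory_order_release);      /* kernel: smp_wmb() */
        c->time_copy = now;
}

static uint64_t read_time(struct clock_pair *c)
{
        uint64_t t, copy;

        do {
                copy = c->time_copy;
                atomic_thread_fence(memory_order_acquire);  /* kernel: smp_rmb() */
                t = c->time;
        } while (t != copy);    /* retry while a writer is mid-update */

        return t;
}

int main(void)
{
        struct clock_pair c = { 0, 0 };

        publish(&c, 12345);
        printf("%llu\n", (unsigned long long)read_time(&c));
        return 0;
}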
*/ - update_entity_load_avg(curr, 1); - update_cfs_rq_blocked_load(cfs_rq, 1); + update_load_avg(curr, 1); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4258,14 +4163,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { - update_rq_runnable_avg(rq, rq->nr_running); + if (!se) add_nr_running(rq, 1); - } + hrtick_update(rq); } @@ -4319,14 +4223,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; + update_load_avg(se, 1); update_cfs_shares(cfs_rq); - update_entity_load_avg(se, 1); } - if (!se) { + if (!se) sub_nr_running(rq, 1); - update_rq_runnable_avg(rq, 1); - } + hrtick_update(rq); } @@ -4439,6 +4342,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs); +} + #ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the @@ -4460,7 +4369,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, static void update_idle_cpu_load(struct rq *this_rq) { unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); unsigned long pending_updates; /* @@ -4506,7 +4415,7 @@ void update_cpu_load_nohz(void) */ void update_cpu_load_active(struct rq *this_rq) { - unsigned long load = this_rq->cfs.runnable_load_avg; + unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ @@ -4514,12 +4423,6 @@ void update_cpu_load_active(struct rq *this_rq) __update_cpu_load(this_rq, load, 1); } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->cfs.runnable_load_avg; -} - /* * Return a low guess at the load of a migration-source cpu weighted * according to the scheduling class and "nice" value. @@ -4567,7 +4470,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = rq->cfs.runnable_load_avg; + unsigned long load_avg = weighted_cpuload(cpu); if (nr_running) return load_avg / nr_running; @@ -4686,7 +4589,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * w = rw_i + @wl */ - w = se->my_q->load.weight + wl; + w = cfs_rq_load_avg(se->my_q) + wl; /* * wl = S * s'_i; see (2) @@ -4707,7 +4610,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) /* * wl = dw_i = S * (s'_i - s_i); see (3) */ - wl -= se->load.weight; + wl -= se->avg.load_avg; /* * Recursively apply this logic to all parent groups to compute @@ -4730,26 +4633,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. 
In order + to determine whether we should let the load spread vs consolidating to + * shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of llc_size higher frequency in the other. With + * both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. Waker/wakee + * being client/server, worker/dispatcher, interrupt source or whatever is + * irrelevant; the spread criterion is simply that the apparent partner count exceeds socket size. + */ static int wake_wide(struct task_struct *p) { + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - /* - * Yeah, it's the switching-frequency, could means many wakee or - * rapidly switch, use factor here will just help to automatically - * adjust the loose-degree, so bigger node will lead to more pull. - */ - if (p->wakee_flips > factor) { - /* - * wakee is somewhat hot, it needs certain amount of cpu - * resource, so if waker is far more hot, prefer to leave - * it alone. - */ - if (current->wakee_flips > (factor * p->wakee_flips)) - return 1; - } - - return 0; + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) @@ -4761,13 +4667,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; - /* - * If we wake multiple tasks be careful to not bounce - * ourselves around too much. - */ - if (wake_wide(p)) - return 0; - idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -4781,14 +4680,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ if (sync) { tg = task_group(current); - weight = current->se.load.weight; + weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); - weight = p->se.load.weight; + weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle @@ -4976,32 +4875,39 @@ next: done: return target; } + /* - * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can - * compare the usage with the capacity of the CPU that is available for CFS - * task (ie cpu_capacity). - * cfs.utilization_load_avg is the sum of running time of runnable tasks on a - * CPU. It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full - * capacity of the CPU because it's about the running time on this CPU. - * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE - * because of unfortunate rounding in avg_period and running_load_avg or just - * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the usage stays into the range - * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the usage, a group could be seen as overloaded (CPU0 usage - * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity).
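To make the flip-frequency condition of the new wake_wide() above concrete, here is a standalone sketch (not part of the patch; the LLC size and flip counts are invented example numbers). With an LLC of 4 CPUs, a waker that has flipped wakees 40 times and a wakee that has flipped 5 times satisfies both conditions (5 >= 4 and 40 >= 5 * 4), so the wakeup is spread rather than pulled onto the waker's cache domain.

#include <stdio.h>

/* Standalone mirror of the wake_wide() heuristic, for illustration only. */
static int wake_wide_example(unsigned int waker_flips, unsigned int wakee_flips,
			     unsigned int llc_size)
{
	unsigned int master = waker_flips;
	unsigned int slave = wakee_flips;

	if (master < slave) {		/* always compare the larger count against the smaller */
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	/* both partners must flip often enough relative to the LLC span */
	if (slave < llc_size || master < slave * llc_size)
		return 0;		/* keep the pair together (affine wakeup) */
	return 1;			/* spread: apparent partner count exceeds the LLC */
}

int main(void)
{
	/* hypothetical numbers: 4-CPU LLC, waker flipped 40 times, wakee 5 times */
	printf("wake_wide = %d\n", wake_wide_example(40, 5, 4));	/* prints 1 */
	return 0;
}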
+ * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). */ -static int get_cpu_usage(int cpu) +static int cpu_util(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (usage >= SCHED_LOAD_SCALE) - return capacity; - - return (usage * capacity) >> SCHED_LOAD_SHIFT; + return (util >= capacity) ? capacity : util; } /* @@ -5021,17 +4927,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = cpu; + int new_cpu = prev_cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - continue; + break; /* * If both cpu and prev_cpu are part of this domain, @@ -5045,17 +4951,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; + else if (!want_affine) + break; } - if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - new_cpu = select_idle_sibling(p, prev_cpu); - goto unlock; + if (affine_sd) { + sd = NULL; /* Prefer wake_affine over balance flags */ + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + new_cpu = cpu; } - while (sd) { + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, new_cpu); + + } else while (sd) { struct sched_group *group; int weight; @@ -5089,7 +4999,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } -unlock: rcu_read_unlock(); return new_cpu; @@ -5101,26 +5010,27 @@ unlock: * previous cpu. However, the caller only guarantees p->pi_lock is held; no * other assumptions, including the state of rq->lock, should be made. 
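As a rough illustration of the clamping described in the cpu_util() comment above (a standalone sketch; the utilization and capacity figures are invented), a CPU whose cfs_rq.avg.util_avg has transiently overshot its original capacity after a migration is still reported as at most capacity_orig, so no single CPU can contribute more than 100% to its group:

#include <stdio.h>

/* Illustrative clamp mirroring cpu_util(): utilization is capped at capacity_orig. */
static unsigned long cpu_util_example(unsigned long util_avg, unsigned long capacity_orig)
{
	return (util_avg >= capacity_orig) ? capacity_orig : util_avg;
}

int main(void)
{
	/* hypothetical values, with capacity_orig = 1024 */
	printf("%lu\n", cpu_util_example(1240, 1024));	/* transient overshoot -> 1024 */
	printf("%lu\n", cpu_util_example(820, 1024));	/* normal case -> 820 */
	return 0;
}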
*/ -static void -migrate_task_rq_fair(struct task_struct *p, int next_cpu) +static void migrate_task_rq_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - /* - * Load tracking: accumulate removed load so that it can be processed - * when we next update owning cfs_rq under rq->lock. Tasks contribute - * to blocked load iff they have a positive decay-count. It can never - * be negative here since on-rq tasks have decay-count == 0. + * We are supposed to update the task to "current" time, then its up to date + * and ready to go to new CPU/cfs_rq. But we have difficulty in getting + * what current time is, so simply throw away the out-of-date time. This + * will result in the wakee task is less decayed, but giving the wakee more + * load sounds not bad. */ - if (se->avg.decay_count) { - se->avg.decay_count = -__synchronize_entity_decay(se); - atomic_long_add(se->avg.load_avg_contrib, - &cfs_rq->removed_load); - } + remove_entity_load_avg(&p->se); + + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - se->exec_start = 0; + p->se.exec_start = 0; +} + +static void task_dead_fair(struct task_struct *p) +{ + remove_entity_load_avg(&p->se); } #endif /* CONFIG_SMP */ @@ -5670,72 +5580,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns true if the destination node is the preferred node. - * Needs to match fbq_classify_rq(): if there is a runnable task - * that is not on its preferred node, we should identify it. + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. */ -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || - !(env->sd->flags & SD_NUMA)) { - return false; - } - - src_nid = cpu_to_node(env->src_cpu); - dst_nid = cpu_to_node(env->dst_cpu); - - if (src_nid == dst_nid) - return false; - - /* Encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; - - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return false; - - if (numa_group) { - src_faults = group_faults(p, src_nid); - dst_faults = group_faults(p, dst_nid); - } else { - src_faults = task_faults(p, src_nid); - dst_faults = task_faults(p, dst_nid); - } - - return dst_faults > src_faults; -} - - -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) -{ - struct numa_group *numa_group = rcu_dereference(p->numa_group); - unsigned long src_faults, dst_faults; - int src_nid, dst_nid; - - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) - return false; + if (!static_branch_likely(&sched_numa_balancing)) + return -1; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return false; + return -1; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return false; + return -1; - /* Migrating away from the preferred node is bad. */ - if (src_nid == p->numa_preferred_nid) - return true; + /* Migrating away from the preferred node is always bad. 
*/ + if (src_nid == p->numa_preferred_nid) { + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) + return 1; + else + return -1; + } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return false; + return 0; if (numa_group) { src_faults = group_faults(p, src_nid); @@ -5749,16 +5626,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) } #else -static inline bool migrate_improves_locality(struct task_struct *p, +static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return false; -} - -static inline bool migrate_degrades_locality(struct task_struct *p, - struct lb_env *env) -{ - return false; + return -1; } #endif @@ -5768,7 +5639,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p, static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot = 0; + int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); @@ -5826,13 +5697,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env); - if (!tsk_cache_hot) - tsk_cache_hot = migrate_degrades_locality(p, env); + tsk_cache_hot = migrate_degrades_locality(p, env); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, env); - if (migrate_improves_locality(p, env) || !tsk_cache_hot || + if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot) { + if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } @@ -5906,6 +5777,13 @@ static int detach_tasks(struct lb_env *env) return 0; while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, + * which could at worst lead to a livelock crash. + */ + if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + break; + p = list_first_entry(tasks, struct task_struct, se.group_node); env->loop++; @@ -6015,39 +5893,6 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) -{ - struct sched_entity *se = tg->se[cpu]; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - return; - - update_cfs_rq_blocked_load(cfs_rq, 1); - - if (se) { - update_entity_load_avg(se, 1); - /* - * We pivot on our runnable average having decayed to zero for - * list removal. This generally implies that all our children - * have also been removed (modulo rounding error or bandwidth - * control); however, such cases are rare and we can fix these - * at enqueue. - * - * TODO: fix up out-of-order children on enqueue. - */ - if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) - list_del_leaf_cfs_rq(cfs_rq); - } else { - struct rq *rq = rq_of(cfs_rq); - update_rq_runnable_avg(rq, rq->nr_running); - } -} - static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6056,19 +5901,19 @@ static void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. 
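The tri-state returned by migrate_degrades_locality() is consumed as in the can_migrate_task() hunk above; the sketch below is a simplified standalone restatement (names and thresholds are illustrative, and the surrounding affinity and pinning checks are omitted). Locality information, when available, overrides the cache-hotness test, and even a hot task is migrated once enough balance attempts have failed.

#include <stdbool.h>
#include <stdio.h>

/*
 * locality: 1 = migration degrades NUMA locality, 0 = improves it,
 *           -1 = locality unknown or NUMA balancing disabled.
 */
static bool may_migrate(int locality, bool cache_hot,
			unsigned int nr_balance_failed, unsigned int cache_nice_tries)
{
	int hot = locality;

	if (hot == -1)				/* no NUMA hint: fall back to cache hotness */
		hot = cache_hot ? 1 : 0;

	/* migrate if it helps or is neutral, or if balancing keeps failing */
	return hot <= 0 || nr_balance_failed > cache_nice_tries;
}

int main(void)
{
	printf("%d\n", may_migrate(0, true, 0, 1));	/* improves locality -> 1 */
	printf("%d\n", may_migrate(-1, true, 0, 1));	/* cache hot, few failures -> 0 */
	printf("%d\n", may_migrate(1, false, 3, 1));	/* degrades, but balancing failing -> 1 */
	return 0;
}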
*/ for_each_leaf_cfs_rq(rq, cfs_rq) { - /* - * Note: We may want to consider periodically releasing - * rq->lock about these updates so that creating many task - * groups does not result in continually extending hold time. - */ - __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); - } + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + update_tg_load_avg(cfs_rq, 0); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6096,14 +5941,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; - load = div64_ul(load * se->avg.load_avg_contrib, - cfs_rq->runnable_load_avg + 1); + load = div64_ul(load * se->avg.load_avg, + cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; @@ -6115,17 +5960,25 @@ static unsigned long task_h_load(struct task_struct *p) struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); - return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, - cfs_rq->runnable_load_avg + 1); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq_load_avg(cfs_rq) + 1); } #else static inline void update_blocked_averages(int cpu) { + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq = &rq->cfs; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static unsigned long task_h_load(struct task_struct *p) { - return p->se.avg.load_avg_contrib; + return p->se.avg.load_avg; } #endif @@ -6146,7 +5999,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_usage; /* Total usage of the group */ + unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -6222,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_cpu_capacity(sd, cpu); -} - static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6264,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = SCHED_CAPACITY_SCALE; + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_cpu_capacity(sd, cpu); - else - capacity *= default_scale_cpu_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); @@ -6399,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group) * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. 
* We consider that a group has spare capacity if the * number of task is - * smaller than the number of CPUs or if the usage is lower than the available - * capacity for CFS tasks. + * smaller than the number of CPUs or if the utilization is lower than the + * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. @@ -6414,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) return true; if ((sgs->group_capacity * 100) > - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6435,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; if ((sgs->group_capacity * 100) < - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; } -static enum group_type group_classify(struct lb_env *env, - struct sched_group *group, - struct sg_lb_stats *sgs) +static inline enum +group_type group_classify(struct sched_group *group, + struct sg_lb_stats *sgs) { if (sgs->group_no_capacity) return group_overloaded; @@ -6483,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->group_usage += get_cpu_usage(i); + sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) @@ -6508,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(env, group, sgs); + sgs->group_type = group_classify(group, sgs); } /** @@ -6642,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_overloaded; + sgs->group_type = group_classify(sg, sgs); } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -7822,8 +7655,22 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) + if (likely(update_next_balance)) { rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. 
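The group_has_capacity()/group_is_overloaded() checks above compare scaled integers rather than dividing by the utilization. A hedged numeric sketch (imbalance_pct and the capacities are example values, not taken from any particular topology, and the sum_nr_running guard is omitted): with group_capacity = 1024 and imbalance_pct = 117, the group counts as overloaded once group_util * 117 exceeds 1024 * 100, i.e. once utilization passes roughly 875.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the group_is_overloaded() comparison: pure integer math, no division. */
static bool group_overloaded_example(unsigned long group_capacity,
				     unsigned long group_util,
				     unsigned int imbalance_pct)
{
	return (group_capacity * 100) < (group_util * imbalance_pct);
}

int main(void)
{
	/* hypothetical single-CPU group: capacity 1024, imbalance_pct 117 */
	printf("%d\n", group_overloaded_example(1024, 850, 117));	/* 0: 99450 < 102400 */
	printf("%d\n", group_overloaded_example(1024, 900, 117));	/* 1: 105300 > 102400 */
	return 0;
}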
+ */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } } #ifdef CONFIG_NO_HZ_COMMON @@ -7836,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) @@ -7867,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rebalance_domains(rq, CPU_IDLE); } - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } } - nohz.next_balance = this_rq->next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } @@ -8023,10 +7882,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (numabalancing_enabled) + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); - - update_rq_runnable_avg(rq, 1); } /* @@ -8101,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) +static inline bool vruntime_normalized(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when it's - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. + * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, + * the dequeue_entity(.flags=0) will already have normalized the + * vruntime. + */ + if (p->on_rq) + return true; + + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: * - * If it's queued, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !queued, then only when - * the task is sleeping will it still have non-normalized vruntime. + * - A forked child which is waiting for being woken up by + * wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { + if (!se->sum_exec_runtime || p->state == TASK_WAKING) + return true; + + return false; +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -8124,44 +7999,50 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#ifdef CONFIG_SMP - /* - * Remove our load from contribution when we leave sched_fair - * and ensure we don't carry in an old decay_count if we - * switch back. 
- */ - if (se->avg.decay_count) { - __synchronize_entity_decay(se); - subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); - } -#endif + /* Catch up with the cfs_rq and remove our load when we leave */ + detach_entity_load_avg(cfs_rq, se); } -/* - * We switched to the sched_fair class. - */ -static void switched_to_fair(struct rq *rq, struct task_struct *p) +static void attach_task_cfs_rq(struct task_struct *p) { -#ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!task_on_rq_queued(p)) - return; - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_curr(rq); - else - check_preempt_curr(rq, p, 0); + /* Synchronize task with its cfs_rq */ + attach_entity_load_avg(cfs_rq, se); + + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; +} + +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + + if (task_on_rq_queued(p)) { + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (rq->curr == p) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); + } } /* Account for a task changing its policy or group. @@ -8190,62 +8071,22 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP - atomic64_set(&cfs_rq->decay_counter, 1); - atomic_long_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load_avg, 0); + atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int queued) +static void task_move_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq; - - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !queued, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 
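The detach/attach pair above keeps a non-normalized vruntime meaningful across a cfs_rq change by converting it to an offset relative to the old queue's min_vruntime and re-basing it on the new one. A small worked example (all numbers invented): a sleeping task with vruntime 105 ms on a queue whose min_vruntime is 100 ms carries 5 ms of offset, and attaching it to a queue whose min_vruntime is 50 ms yields 55 ms, so its sleep credit is preserved instead of being inflated by the difference between the two queues.

#include <stdio.h>

typedef unsigned long long u64;

/* Illustrative only: how detach/attach re-bases a non-normalized vruntime. */
static u64 detach_vruntime(u64 vruntime, u64 src_min_vruntime)
{
	return vruntime - src_min_vruntime;	/* make it relative to the old queue */
}

static u64 attach_vruntime(u64 rel_vruntime, u64 dst_min_vruntime)
{
	return rel_vruntime + dst_min_vruntime;	/* re-base on the new queue */
}

int main(void)
{
	u64 rel = detach_vruntime(105000000ULL, 100000000ULL);	/* 5 ms, in ns */
	printf("%llu\n", attach_vruntime(rel, 50000000ULL));	/* prints 55000000 */
	return 0;
}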
- */ - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - queued = 1; - - if (!queued) - se->vruntime -= cfs_rq_of(se)->min_vruntime; + detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!queued) { - cfs_rq = cfs_rq_of(se); - se->vruntime += cfs_rq->min_vruntime; + #ifdef CONFIG_SMP - /* - * migrate_task_rq_fair() will have removed our previous - * contribution, but we must synchronize for ongoing future - * decay. - */ - se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + /* Tell se's cfs_rq has been changed -- migrated */ + p->se.avg.last_update_time = 0; #endif - } + attach_task_cfs_rq(p); } void free_fair_sched_group(struct task_group *tg) @@ -8257,8 +8098,11 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) + if (tg->se) { + if (tg->se[i]) + remove_entity_load_avg(tg->se[i]); kfree(tg->se[i]); + } } kfree(tg->cfs_rq); @@ -8295,6 +8139,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_entity_runnable_average(se); } return 1; @@ -8444,6 +8289,8 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, + .task_dead = task_dead_fair, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 91e33cd485f6..69631fa46c2f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) */ SCHED_FEAT(WAKEUP_PREEMPTION, true) -/* - * Use arch dependent cpu capacity functions - */ -SCHED_FEAT(ARCH_CAPACITY, true) - SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) @@ -72,27 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= - */ -#ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, false) - -/* - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a - * higher number of hinting faults are recorded during active load - * balancing. - */ -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) - -/* - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a - * lower number of hinting faults have been recorded. As this has - * the potential to prevent a task ever migrating to a new node - * due to CPU overload it is disabled by default. 
- */ -SCHED_FEAT(NUMA_RESIST_LOWER, false) -#endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 594275ed2620..4a2ef5a02fd3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); + stop_critical_timings(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); return 1; @@ -83,10 +85,13 @@ void __weak arch_cpu_idle(void) */ void default_idle_call(void) { - if (current_clr_polling_and_test()) + if (current_clr_polling_and_test()) { local_irq_enable(); - else + } else { + stop_critical_timings(); arch_cpu_idle(); + start_critical_timings(); + } } static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, @@ -141,12 +146,6 @@ static void cpuidle_idle_call(void) } /* - * During the idle period, stop measuring the disabled irqs - * critical sections latencies - */ - stop_critical_timings(); - - /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more * step to the grace period @@ -198,7 +197,6 @@ exit_idle: local_irq_enable(); rcu_idle_exit(); - start_critical_timings(); } DEFINE_PER_CPU(bool, cpu_dead_idle); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c65dac8c97cd..c4ae0f1fdf9b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0d193a243e96..e3cc16312046 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) /* * We ran out of runtime, see if we can borrow some from our neighbours. 
*/ -static int do_balance_runtime(struct rt_rq *rt_rq) +static void do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; - int i, weight, more = 0; + int i, weight; u64 rt_period; weight = cpumask_weight(rd->span); @@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; rt_rq->rt_runtime += diff; - more = 1; if (rt_rq->rt_runtime == rt_period) { raw_spin_unlock(&iter->rt_runtime_lock); break; @@ -683,8 +682,6 @@ next: raw_spin_unlock(&iter->rt_runtime_lock); } raw_spin_unlock(&rt_b->rt_runtime_lock); - - return more; } /* @@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq) } } -static int balance_runtime(struct rt_rq *rt_rq) +static void balance_runtime(struct rt_rq *rt_rq) { - int more = 0; - if (!sched_feat(RT_RUNTIME_SHARE)) - return more; + return; if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); + do_balance_runtime(rt_rq); raw_spin_lock(&rt_rq->rt_runtime_lock); } - - return more; } #else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} +static inline void balance_runtime(struct rt_rq *rt_rq) {} #endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) @@ -2069,7 +2059,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || @@ -2077,45 +2066,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) push_rt_tasks(rq); } -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct rq *rq; - int weight; - - BUG_ON(!rt_task(p)); - - if (!task_on_rq_queued(p)) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. 
- */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - rq = task_rq(p); - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_task(rq, p); - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_task(rq, p); - rq->rt.rt_nr_migratory++; - } - - update_rt_migration(&rq->rt); -} - /* Assumes rq->lock is held */ static void rq_online_rt(struct rq *rq) { @@ -2324,7 +2274,7 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_rt, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 84d48790bb6d..efd3bfc7e347 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } */ #define RUNTIME_INF ((u64)~0ULL) +static inline int idle_policy(int policy) +{ + return policy == SCHED_IDLE; +} static inline int fair_policy(int policy) { return policy == SCHED_NORMAL || policy == SCHED_BATCH; @@ -98,6 +102,11 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } +static inline bool valid_policy(int policy) +{ + return idle_policy(policy) || fair_policy(policy) || + rt_policy(policy) || dl_policy(policy); +} static inline int task_has_rt_policy(struct task_struct *p) { @@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -static inline bool dl_time_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - /* * Tells if entity @a should preempt entity @b. */ @@ -245,7 +249,6 @@ struct task_group { #ifdef CONFIG_SMP atomic_long_t load_avg; - atomic_t runnable_avg; #endif #endif @@ -366,27 +369,20 @@ struct cfs_rq { #ifdef CONFIG_SMP /* - * CFS Load tracking - * Under CFS, load is tracked on a per-entity basis and aggregated up. - * This allows for the description of both thread and group usage (in - * the FAIR_GROUP_SCHED case). - * runnable_load_avg is the sum of the load_avg_contrib of the - * sched_entities on the rq. - * blocked_load_avg is similar to runnable_load_avg except that its - * the blocked sched_entities on the rq. - * utilization_load_avg is the sum of the average running time of the - * sched_entities on the rq. 
+ * CFS load tracking */ - unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; - atomic64_t decay_counter; - u64 last_decay; - atomic_long_t removed_load; - + struct sched_avg avg; + u64 runnable_load_sum; + unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED - /* Required to track per-cpu representation of a task_group */ - u32 tg_runnable_contrib; - unsigned long tg_load_contrib; + unsigned long tg_load_avg_contrib; +#endif + atomic_long_t removed_load_avg, removed_util_avg; +#ifndef CONFIG_64BIT + u64 load_last_update_time_copy; +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED /* * h_load = weight * f(tg) * @@ -595,8 +591,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; - - struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -1013,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) -#else -extern bool numabalancing_enabled; -#endif /* CONFIG_SCHED_DEBUG */ -#else -#define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) -#endif /* CONFIG_NUMA_BALANCING */ +extern struct static_key_false sched_numa_balancing; static inline u64 global_rt_period(void) { @@ -1065,9 +1049,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) #endif @@ -1091,9 +1072,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). 
*/ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ @@ -1169,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_HEAD 2 +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_HEAD 0x02 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ #else -#define ENQUEUE_WAKING 0 +#define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 8 +#define ENQUEUE_REPLENISH 0x08 +#define ENQUEUE_RESTORE 0x10 -#define DEQUEUE_SLEEP 1 +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1206,7 +1190,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1239,7 +1223,7 @@ struct sched_class { void (*update_curr) (struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p); #endif }; @@ -1268,6 +1252,8 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); + #else static inline void idle_enter_fair(struct rq *rq) { } @@ -1319,7 +1305,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -extern void init_task_runnable_average(struct task_struct *p); +extern void init_entity_runnable_average(struct sched_entity *se); static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -1415,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 79ffec45a6ac..cbc67da10954 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, #endif .set_curr_task = set_curr_task_stop, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 245df6b32b81..580ac2d4024f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. 
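The smp_store_release() in finish_lock_switch() above replaces the open-coded smp_wmb() plus plain store, and per the added comment it pairs with the ordering on the try_to_wake_up() side. A userspace analogue using C11 atomics (purely illustrative, not kernel code; the variable names are invented) shows the same release/acquire pattern: every store made before the release store is visible to a thread that observes the flag with an acquire load.

#include <stdatomic.h>
#include <stdio.h>

static int task_state;			/* data published before the flag flips */
static atomic_int on_cpu = 1;

static void finish_switch(void)
{
	task_state = 42;		/* all prior stores ... */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);	/* ... ordered before this */
}

static void wakeup_side(void)
{
	if (atomic_load_explicit(&on_cpu, memory_order_acquire) == 0)
		printf("%d\n", task_state);	/* guaranteed to see 42 */
}

int main(void)
{
	finish_switch();
	wakeup_side();
	return 0;
}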
*/ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. */ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; @@ -348,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; + const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -371,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(-ENOMEM); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, - seccomp_check_filter); + seccomp_check_filter, save_orig); if (ret < 0) { kfree(sfilter); return ERR_PTR(ret); @@ -470,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk) static inline void seccomp_filter_free(struct seccomp_filter *filter) { if (filter) { - bpf_prog_free(filter->prog); + bpf_prog_destroy(filter->prog); kfree(filter); } } @@ -549,7 +549,11 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (mode == 0) + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); @@ -650,6 +654,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ @@ -860,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) /* prctl interface doesn't have flags, so they are always zero. */ return do_seccomp(op, 0, uargs); } + +#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) +long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, + void __user *data) +{ + struct seccomp_filter *filter; + struct sock_fprog_kern *fprog; + long ret; + unsigned long count = 0; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + spin_lock_irq(&task->sighand->siglock); + if (task->seccomp.mode != SECCOMP_MODE_FILTER) { + ret = -EINVAL; + goto out; + } + + filter = task->seccomp.filter; + while (filter) { + filter = filter->prev; + count++; + } + + if (filter_off >= count) { + ret = -ENOENT; + goto out; + } + count -= filter_off; + + filter = task->seccomp.filter; + while (filter && count > 1) { + filter = filter->prev; + count--; + } + + if (WARN_ON(count != 1 || !filter)) { + /* The filter tree shouldn't shrink while we're using it. */ + ret = -ENOENT; + goto out; + } + + fprog = filter->prog->orig_prog; + if (!fprog) { + /* This must be a new non-cBPF filter, since we save every + * every cBPF filter's orig_prog above when + * CONFIG_CHECKPOINT_RESTORE is enabled. 
+ */ + ret = -EMEDIUMTYPE; + goto out; + } + + ret = fprog->len; + if (!data) + goto out; + + get_seccomp_filter(task); + spin_unlock_irq(&task->sighand->siglock); + + if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) + ret = -EFAULT; + + put_seccomp_filter(task); + return ret; + +out: + spin_unlock_irq(&task->sighand->siglock); + return ret; +} +#endif diff --git a/kernel/signal.c b/kernel/signal.c index 836df8dac6cc..0f6bbbe77b46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif #ifdef SEGV_BNDERR - err |= __put_user(from->si_lower, &to->si_lower); - err |= __put_user(from->si_upper, &to->si_upper); + if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) { + err |= __put_user(from->si_lower, &to->si_lower); + err |= __put_user(from->si_upper, &to->si_upper); + } #endif break; case __SI_CHLD: @@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + siginfo_t info = {}; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + siginfo_t info = {}; if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..d264f59bff56 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); preempt_enable(); - if (ht->cleanup) + /* cleanup must mirror setup */ + if (ht->cleanup && td->status != HP_THREAD_NONE) ht->cleanup(td->cpu, cpu_online(td->cpu)); kfree(td); return 0; @@ -221,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (ht->pre_unpark) - ht->pre_unpark(cpu); - kthread_unpark(tsk); + if (!ht->selfparking) + kthread_unpark(tsk); } void smpboot_unpark_threads(unsigned int cpu) @@ -259,15 +259,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; - /* Unpark any threads that were voluntarily parked. */ - for_each_cpu_not(cpu, ht->cpumask) { - if (cpu_online(cpu)) { - struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); - if (tsk) - kthread_unpark(tsk); - } - } - /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); @@ -281,19 +272,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * to hotplug * @plug_thread: Hotplug thread descriptor + * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. 
*/ -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask) { unsigned int cpu; int ret = 0; if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + cpumask_copy(plug_thread->cpumask, cpumask); get_online_cpus(); mutex_lock(&smpboot_threads_lock); @@ -301,9 +295,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); + free_cpumask_var(plug_thread->cpumask); goto out; } - smpboot_unpark_thread(plug_thread, cpu); + if (cpumask_test_cpu(cpu, cpumask)) + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -311,7 +307,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index fd643d8c4b42..867bc20e1ef1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,13 +35,16 @@ struct cpu_stop_done { /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { + struct task_struct *thread; + spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ + + struct cpu_stop_work stop_work; /* for stop_cpus */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); -static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; /* @@ -70,22 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) } } +static void __cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); +} + /* queue @work to @stopper. if offline, @work is completed immediately */ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct task_struct *p = per_cpu(cpu_stopper_task, cpu); - unsigned long flags; spin_lock_irqsave(&stopper->lock, flags); - - if (stopper->enabled) { - list_add_tail(&work->list, &stopper->works); - wake_up_process(p); - } else + if (stopper->enabled) + __cpu_stop_queue_work(stopper, work); + else cpu_stop_signal_done(work->done, false); - spin_unlock_irqrestore(&stopper->lock, flags); } @@ -139,7 +144,7 @@ enum multi_stop_state { }; struct multi_stop_data { - int (*fn)(void *); + cpu_stop_fn_t fn; void *data; /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. 
*/ unsigned int num_threads; @@ -211,6 +216,31 @@ static int multi_cpu_stop(void *data) return err; } +static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, + int cpu2, struct cpu_stop_work *work2) +{ + struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); + struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + int err; + + lg_double_lock(&stop_cpus_lock, cpu1, cpu2); + spin_lock_irq(&stopper1->lock); + spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + + err = -ENOENT; + if (!stopper1->enabled || !stopper2->enabled) + goto unlock; + + err = 0; + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); +unlock: + spin_unlock(&stopper2->lock); + spin_unlock_irq(&stopper1->lock); + lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + + return err; +} /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop @@ -245,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); - /* - * If we observe both CPUs active we know _cpu_down() cannot yet have - * queued its stop_machine works and therefore ours will get executed - * first. Or its not either one of our CPUs that's getting unplugged, - * in which case we don't care. - * - * This relies on the stopper workqueues to be FIFO. - */ - if (!cpu_active(cpu1) || !cpu_active(cpu2)) { + if (cpu1 > cpu2) + swap(cpu1, cpu2); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { preempt_enable(); return -ENOENT; } - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); - cpu_stop_queue_work(cpu1, &work1); - cpu_stop_queue_work(cpu2, &work2); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); - preempt_enable(); wait_for_completion(&done.completion); @@ -293,7 +312,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); static void queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, @@ -302,22 +320,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, struct cpu_stop_work *work; unsigned int cpu; - /* initialize works and done */ - for_each_cpu(cpu, cpumask) { - work = &per_cpu(stop_cpus_work, cpu); - work->fn = fn; - work->arg = arg; - work->done = done; - } - /* * Disable preemption while queueing to avoid getting * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ lg_global_lock(&stop_cpus_lock); - for_each_cpu(cpu, cpumask) - cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); + for_each_cpu(cpu, cpumask) { + work = &per_cpu(cpu_stopper.stop_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = done; + cpu_stop_queue_work(cpu, work); + } lg_global_unlock(&stop_cpus_lock); } @@ -454,45 +469,47 @@ repeat: } } +void stop_machine_park(int cpu) +{ + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + /* + * Lockless. cpu_stopper_thread() will take stopper->lock and flush + * the pending works before it parks, until then it is fine to queue + * the new works. 
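stop_two_cpus() above swaps its arguments so the two per-CPU stopper locks are always taken in ascending CPU order inside cpu_stop_queue_two_works(). A minimal standalone sketch of that deadlock-avoidance pattern follows (pthread mutexes stand in for the stopper spinlocks; the array size and indices are invented for illustration): two concurrent callers working on the same pair can never end up holding one lock each.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4
static pthread_mutex_t stopper_lock[NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Always lock the lower-numbered CPU first to rule out ABBA deadlock. */
static void queue_two_cpus_example(int cpu1, int cpu2)
{
	if (cpu1 > cpu2) {
		int tmp = cpu1; cpu1 = cpu2; cpu2 = tmp;
	}
	pthread_mutex_lock(&stopper_lock[cpu1]);
	pthread_mutex_lock(&stopper_lock[cpu2]);
	printf("queued work on cpus %d and %d\n", cpu1, cpu2);
	pthread_mutex_unlock(&stopper_lock[cpu2]);
	pthread_mutex_unlock(&stopper_lock[cpu1]);
}

int main(void)
{
	queue_two_cpus_example(3, 1);	/* same lock order as queue_two_cpus_example(1, 3) */
	return 0;
}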
+ */ + stopper->enabled = false; + kthread_park(stopper->thread); +} + extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) { - sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); + sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); } static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work; - unsigned long flags; - /* drain remaining works */ - spin_lock_irqsave(&stopper->lock, flags); - list_for_each_entry(work, &stopper->works, list) - cpu_stop_signal_done(work->done, false); - stopper->enabled = false; - spin_unlock_irqrestore(&stopper->lock, flags); + WARN_ON(!list_empty(&stopper->works)); } -static void cpu_stop_unpark(unsigned int cpu) +void stop_machine_unpark(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_irq(&stopper->lock); stopper->enabled = true; - spin_unlock_irq(&stopper->lock); + kthread_unpark(stopper->thread); } static struct smp_hotplug_thread cpu_stop_threads = { - .store = &cpu_stopper_task, + .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", .create = cpu_stop_create, - .setup = cpu_stop_unpark, .park = cpu_stop_park, - .pre_unpark = cpu_stop_unpark, .selfparking = true, }; @@ -508,6 +525,7 @@ static int __init cpu_stop_init(void) } BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); + stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } @@ -515,7 +533,7 @@ early_initcall(cpu_stop_init); #ifdef CONFIG_STOP_MACHINE -int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -548,7 +566,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } -int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { int ret; @@ -582,7 +600,7 @@ EXPORT_SYMBOL_GPL(stop_machine); * 0 if all executions of @fn returned 0, any non zero return value if any * returned non zero. */ -int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, +int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, diff --git a/kernel/sys.c b/kernel/sys.c index 259fda25eb6b..fa2f2f671a5c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * overall picture. 
*/ err = -EACCES; - if (!S_ISREG(inode->i_mode) || - exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) + if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path)) goto exit; err = inode_permission(inode, MAY_EXEC); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7995ef5868d8..0623787ec67a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask); cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_modify_ldt); cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); @@ -193,6 +194,7 @@ cond_syscall(sys_mlock); cond_syscall(sys_munlock); cond_syscall(sys_mlockall); cond_syscall(sys_munlockall); +cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); @@ -218,6 +220,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); @@ -243,3 +246,6 @@ cond_syscall(sys_bpf); /* execveat */ cond_syscall(sys_execveat); + +/* membarrier */ +cond_syscall(sys_membarrier); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 19b62b522158..dc6858d6639e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,6 +64,7 @@ #include <linux/binfmts.h> #include <linux/sched/sysctl.h> #include <linux/kexec.h> +#include <linux/bpf.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -621,7 +622,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, @@ -887,6 +888,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", @@ -897,6 +909,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_SMP */ #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) @@ -1139,6 +1160,18 @@ static struct ctl_table kern_table[] = { .proc_handler = timer_migration_handler, }, #endif +#ifdef CONFIG_BPF_SYSCALL + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif { } }; @@ -1995,7 +2028,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int val = *valp; if (val < 0) { *negp = true; - *lvalp = (unsigned long)-val; + *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; @@ -2201,7 +2234,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int val = *valp; if (val < 0) { *negp = true; - *lvalp = (unsigned long)-val; + *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; @@ -2436,7 +2469,7 @@ static int 
do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; @@ -2459,7 +2492,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; @@ -2484,7 +2517,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, unsigned long lval; if (val < 0) { *negp = true; - lval = (unsigned long)-val; + lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S deleted file mode 100644 index 3e9868d47535..000000000000 --- a/kernel/system_certificates.S +++ /dev/null @@ -1,20 +0,0 @@ -#include <linux/export.h> -#include <linux/init.h> - - __INITRODATA - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list) -VMLINUX_SYMBOL(system_certificate_list): -__cert_list_start: - .incbin "kernel/x509_certificate_list" -__cert_list_end: - - .align 8 - .globl VMLINUX_SYMBOL(system_certificate_list_size) -VMLINUX_SYMBOL(system_certificate_list_size): -#ifdef CONFIG_64BIT - .quad __cert_list_end - __cert_list_start -#else - .long __cert_list_end - __cert_list_start -#endif diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c deleted file mode 100644 index 875f64e8935b..000000000000 --- a/kernel/system_keyring.c +++ /dev/null @@ -1,106 +0,0 @@ -/* System trusted keyring for trusted public keys - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include <keys/system_keyring.h> -#include "module-internal.h" - -struct key *system_trusted_keyring; -EXPORT_SYMBOL_GPL(system_trusted_keyring); - -extern __initconst const u8 system_certificate_list[]; -extern __initconst const unsigned long system_certificate_list_size; - -/* - * Load the compiled-in keys - */ -static __init int system_trusted_keyring_init(void) -{ - pr_notice("Initialise system trusted keyring\n"); - - system_trusted_keyring = - keyring_alloc(".system_keyring", - KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(system_trusted_keyring)) - panic("Can't allocate system trusted keyring\n"); - - set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(system_trusted_keyring_init); - -/* - * Load the compiled-in list of X.509 certificates. 
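The parser deleted below walks a blob of back-to-back DER certificates by reading each one's ASN.1 SEQUENCE header (this diffstat only covers kernel/, so the same logic may simply live elsewhere in the tree). A stand-alone sketch of that length decoding, assuming the same "two-byte long length form" (0x30 0x82) the kernel code accepts; note the sketch rejects a wrong tag or a wrong length byte, where the original's '&&' only rejected both at once:

#include <stdio.h>

/*
 * Each cert must start with SEQUENCE (0x30) in the two-byte long-length
 * form (0x82), i.e. a body of at least 256 bytes; the total size is the
 * 16-bit big-endian length plus the 4 header bytes.
 */
static int walk_cert_list(const unsigned char *p, size_t size)
{
	const unsigned char *end = p + size;

	while (p < end) {
		size_t plen;

		if ((size_t)(end - p) < 4 || p[0] != 0x30 || p[1] != 0x82)
			return -1;			/* dodgy cert */
		plen = ((size_t)p[2] << 8 | p[3]) + 4;
		if (plen > (size_t)(end - p))
			return -1;
		printf("certificate of %zu bytes\n", plen);
		p += plen;
	}
	return 0;
}

int main(void)
{
	/* One fake entry with a 256-byte body: header 0x30 0x82 0x01 0x00. */
	unsigned char blob[4 + 256] = { 0x30, 0x82, 0x01, 0x00 };

	return walk_cert_list(blob, sizeof(blob)) ? 1 : 0;
}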
- */ -static __init int load_system_certificate_list(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading compiled-in X.509 certificates\n"); - - p = system_certificate_list; - end = p + system_certificate_list_size; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), - "asymmetric", - NULL, - p, - plen, - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA | - KEY_ALLOC_TRUSTED); - if (IS_ERR(key)) { - pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - } else { - set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); - pr_notice("Loaded X.509 cert '%s'\n", - key_ref_to_ptr(key)->description); - key_ref_put(key); - } - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_system_certificate_list); diff --git a/kernel/task_work.c b/kernel/task_work.c index 8727032e3a6f..53fa971d000d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * This is like the signal handler which runs in kernel mode, but it doesn't * try to wake up the @task. * + * Note: there is no ordering guarantee on works queued here. + * * RETURNS: * 0 if succeeds or -ESRCH. */ @@ -108,16 +110,6 @@ void task_work_run(void) raw_spin_unlock_wait(&task->pi_lock); smp_mb(); - /* Reverse the list to run the works in fifo order */ - head = NULL; - do { - next = work->next; - work->next = head; - head = work; - work = next; - } while (work); - - work = head; do { next = work->next; work->func(work); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 579ce1b929af..4008d9f95dd7 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -92,12 +92,10 @@ config NO_HZ_FULL depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP - # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON - select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 50eb107f1198..a9b76a40319e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { - /* Transition with legacy set_mode() callback */ - if (dev->set_mode) { - /* Legacy callback doesn't support new modes */ - if (state > CLOCK_EVT_STATE_ONESHOT) - return -ENOSYS; - /* - * 'clock_event_state' and 'clock_event_mode' have 1-to-1 - * mapping until *_ONESHOT, and so a simple cast will work. 
- */ - dev->set_mode((enum clock_event_mode)state, dev); - dev->mode = (enum clock_event_mode)state; - return 0; - } - if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; @@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; - if (dev->set_mode) { - dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); - dev->mode = CLOCK_EVT_MODE_RESUME; - } else if (dev->tick_resume) { + if (dev->tick_resume) ret = dev->tick_resume(dev); - } return ret; } @@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind_device); -/* Sanity check of state transition callbacks */ -static int clockevents_sanity_check(struct clock_event_device *dev) -{ - /* Legacy set_mode() callback */ - if (dev->set_mode) { - /* We shouldn't be supporting new modes now */ - WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume || - dev->set_state_oneshot_stopped); - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - return 0; - } - - if (dev->features & CLOCK_EVT_FEAT_DUMMY) - return 0; - - return 0; -} - /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(clockevents_sanity_check(dev)); - /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 841b72f720e8..0d8fe8b8f727 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { + if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", @@ -479,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * return half the number of nanoseconds the hardware counter can technically * cover. This is done so that we can potentially detect problems caused by * delayed timers or bad hardware, which might result in time intervals that - * are larger then what the math used can handle without overflows. + * are larger than what the math used can handle without overflows. */ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { @@ -595,16 +595,15 @@ static void __clocksource_select(bool skipcur) */ static void clocksource_select(void) { - return __clocksource_select(false); + __clocksource_select(false); } static void clocksource_select_fallback(void) { - return __clocksource_select(true); + __clocksource_select(true); } #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ - static inline void clocksource_select(void) { } static inline void clocksource_select_fallback(void) { } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5c7ae4b641c4..435b8850dd80 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,7 +59,7 @@ /* * The timer bases: * - * There are more clockids then hrtimer bases. Thus, we index + * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. 
When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. @@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { if (pinned || !base->migration_enabled) - return this_cpu_ptr(&hrtimer_bases); + return base; return &per_cpu(hrtimer_bases, get_nohz_timer_target()); } #else @@ -191,23 +191,32 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { - return this_cpu_ptr(&hrtimer_bases); + return base; } #endif /* - * Switch the timer base to the current CPU when possible. + * We switch the timer base to a power-optimized selected CPU target, + * if: + * - NO_HZ_COMMON is enabled + * - timer migration is enabled + * - the timer callback is not running + * - the timer is not the first expiring timer on the new target + * + * If one of the above requirements is not fulfilled we move the timer + * to the current CPU or leave it on the previously assigned CPU if + * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { - struct hrtimer_cpu_base *new_cpu_base, *this_base; + struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; - this_base = this_cpu_ptr(&hrtimer_bases); - new_cpu_base = get_target_base(this_base, pinned); + this_cpu_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; @@ -229,19 +238,19 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_base && + if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); - new_cpu_base = this_base; + new_cpu_base = this_cpu_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (new_cpu_base != this_base && + if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { - new_cpu_base = this_base; + new_cpu_base = this_cpu_base; goto again; } } @@ -679,14 +688,14 @@ static void retrigger_next_event(void *arg) /* * Switch to high resolution mode */ -static int hrtimer_switch_to_hres(void) +static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { printk(KERN_WARNING "Could not switch to high resolution " "mode on CPU %d\n", base->cpu); - return 0; + return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; @@ -694,7 +703,6 @@ static int hrtimer_switch_to_hres(void) tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - return 1; } static void clock_was_set_work(struct work_struct *work) @@ -718,7 +726,7 @@ void clock_was_set_delayed(void) static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } -static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_reprogram(struct hrtimer *timer, diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 
fb4d98c7fd43..149cc8086aea 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -99,7 +99,7 @@ static time64_t ntp_next_leap_sec = TIME64_MAX; static int pps_valid; /* signal watchdog counter */ static long pps_tf[3]; /* phase median filter */ static long pps_jitter; /* current jitter (ns) */ -static struct timespec pps_fbase; /* beginning of the last freq interval */ +static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ @@ -487,6 +487,11 @@ out: } #ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock(struct timespec now) +{ + return -ENODEV; +} + int __weak update_persistent_clock64(struct timespec64 now64) { struct timespec now; @@ -504,7 +509,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { struct timespec64 now; - struct timespec next; + struct timespec64 next; int fail = 1; /* @@ -554,7 +559,7 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_nsec -= NSEC_PER_SEC; } queue_delayed_work(system_power_efficient_wq, - &sync_cmos_work, timespec_to_jiffies(&next)); + &sync_cmos_work, timespec64_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) @@ -768,13 +773,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ struct pps_normtime { - __kernel_time_t sec; /* seconds */ + s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; /* normalize the timestamp so that nsec is in the ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ -static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { .sec = ts.tv_sec, @@ -856,7 +861,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) pps_errcnt++; pps_dec_freq_interval(); printk_deferred(KERN_ERR - "hardpps: PPSERROR: interval too long - %ld s\n", + "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; } @@ -943,7 +948,7 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. 
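The pps_normalize_ts() change above only switches the types to timespec64; the arithmetic is untouched. A quick user-space illustration of what the normalization does, folding tv_nsec from [0, NSEC_PER_SEC) into (-NSEC_PER_SEC/2, NSEC_PER_SEC/2] so a phase sample just under a second boundary becomes a small negative offset (names are illustrative):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct pps_normtime {
	long long sec;
	long nsec;
};

/* Same arithmetic as pps_normalize_ts(): fold nsec into (-0.5s, 0.5s]. */
static struct pps_normtime normalize(long long sec, long nsec)
{
	struct pps_normtime norm = { .sec = sec, .nsec = nsec };

	if (norm.nsec > NSEC_PER_SEC / 2) {
		norm.nsec -= NSEC_PER_SEC;
		norm.sec++;
	}
	return norm;
}

int main(void)
{
	/* 10.999999100 s is really "900 ns before the 11 s edge". */
	struct pps_normtime n = normalize(10, 999999100);

	printf("%lld s %ld ns\n", n.sec, n.nsec);	/* prints: 11 s -900 ns */
	return 0;
}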
*/ -void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; @@ -964,7 +969,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) } /* ok, now we have a base for frequency calculation */ - freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); /* check that the signal is in the range * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 65430504ca26..af924470eac0 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -9,5 +9,5 @@ extern ktime_t ntp_get_next_leap(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); -extern void __hardpps(const struct timespec *, const struct timespec *); +extern void __hardpps(const struct timespec64 *, const struct timespec64 *); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 892e3dae0aac..f5e86d282d52 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * but barriers are not required because update_gt_cputime() * can handle concurrent updates. */ - WRITE_ONCE(cputimer->running, 1); + WRITE_ONCE(cputimer->running, true); } sample_cputime_atomic(times, &cputimer->cputime_atomic); } @@ -864,6 +864,13 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long long expires; unsigned long soft; + /* + * If cputime_expires is zero, then there are no active + * per thread CPU timers. + */ + if (task_cputime_zero(&tsk->cputime_expires)) + return; + expires = check_timers_list(timers, firing, prof_ticks(tsk)); tsk_expires->prof_exp = expires_to_cputime(expires); @@ -911,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; /* Turn off cputimer->running. This is done without locking. */ - WRITE_ONCE(cputimer->running, 0); + WRITE_ONCE(cputimer->running, false); } static u32 onecputick; @@ -962,6 +969,19 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* + * If cputimer is not running, then there are no active + * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). + */ + if (!READ_ONCE(tsk->signal->cputimer.running)) + return; + + /* + * Signify that a thread is checking for process timers. + * Write access to this field is protected by the sighand lock. + */ + sig->cputimer.checking_timer = true; + + /* * Collect the current process totals. 
*/ thread_group_cputimer(tsk, &cputime); @@ -1015,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk, sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); + + sig->cputimer.checking_timer = false; } /* @@ -1117,24 +1139,33 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { - struct task_cputime task_sample = { - .utime = utime, - .stime = stime, - .sum_exec_runtime = tsk->se.sum_exec_runtime - }; + struct task_cputime task_sample; + task_cputime(tsk, &task_sample.utime, &task_sample.stime); + task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } sig = tsk->signal; - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(sig->cputimer.running)) { + /* + * Check if thread group timers expired when the cputimer is + * running and no other thread in the group is already checking + * for thread group cputimers. These fields are read without the + * sighand lock. However, this is fine because this is meant to + * be a fastpath heuristic to determine whether we should try to + * acquire the sighand lock to check/handle timers. + * + * In the worst case scenario, if 'running' or 'checking_timer' gets + * set but the current thread doesn't see the change yet, we'll wait + * until the next thread in the group gets a scheduler interrupt to + * handle the timer. This isn't an issue in practice because these + * types of delays with signals actually getting sent are expected. + */ + if (READ_ONCE(sig->cputimer.running) && + !READ_ONCE(sig->cputimer.checking_timer)) { struct task_cputime group_sample; sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); @@ -1174,12 +1205,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) * put them on the firing list. */ check_thread_timers(tsk, &firing); - /* - * If there are any active process wide timers (POSIX 1.b, itimers, - * RLIMIT_CPU) cputimer must be running. - */ - if (READ_ONCE(tsk->signal->cputimer.running)) - check_process_timers(tsk, &firing); + + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. 
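The running/checking_timer pair read in fastpath_timer_check() is a lockless fast path in front of a lock-protected slow path: stale reads are tolerated because the worst case is deferring the expiry check to the next tick. A small user-space analogy of the same shape, with C11 atomics standing in for READ_ONCE/WRITE_ONCE and a mutex for the sighand lock (names are illustrative, not the kernel structures):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ sighand lock */
static atomic_bool running;					/* ~ cputimer.running */
static atomic_bool checking_timer;				/* ~ cputimer.checking_timer */

/* Per-tick fast path: racy reads only decide whether to try the slow path. */
static bool fastpath_should_check(void)
{
	return atomic_load_explicit(&running, memory_order_relaxed) &&
	       !atomic_load_explicit(&checking_timer, memory_order_relaxed);
}

/* Slow path: one thread at a time does the real expiry work under the lock. */
static void check_group_timers(void)
{
	pthread_mutex_lock(&group_lock);
	atomic_store_explicit(&checking_timer, true, memory_order_relaxed);
	/* ... walk and expire group-wide timers here ... */
	atomic_store_explicit(&checking_timer, false, memory_order_relaxed);
	pthread_mutex_unlock(&group_lock);
}

int main(void)
{
	atomic_store(&running, true);
	if (fastpath_should_check())
		check_group_timers();
	return 0;
}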
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 3e7db49a2381..53d7184da0be 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -18,30 +18,23 @@ static struct hrtimer bctimer; -static void bc_set_mode(enum clock_event_mode mode, - struct clock_event_device *bc) +static int bc_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - /* - * Note, we cannot cancel the timer here as we might - * run into the following live lock scenario: - * - * cpu 0 cpu1 - * lock(broadcast_lock); - * hrtimer_interrupt() - * bc_handler() - * tick_handle_oneshot_broadcast(); - * lock(broadcast_lock); - * hrtimer_cancel() - * wait_for_callback() - */ - hrtimer_try_to_cancel(&bctimer); - break; - default: - break; - } + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + return 0; } /* @@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { - .set_mode = bc_set_mode, + .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_KTIME | @@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - switch (ce_broadcast_hrtimer.mode) { - case CLOCK_EVT_MODE_ONESHOT: + if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) return HRTIMER_RESTART; - default: - return HRTIMER_NORESTART; - } + + return HRTIMER_NORESTART; } void tick_setup_hrtimer_broadcast(void) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f8bf47571dda..4fcd99e12aa0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev) int cpu; cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, newdev->cpumask)) - goto out_bc; - td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -401,7 +398,6 @@ void tick_shutdown(unsigned int cpu) * the set mode function! */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); - dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c792429e98c6..7c7ec4515983 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -197,27 +197,9 @@ static bool can_stop_full_tick(void) return true; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); - -/* - * Re-evaluate the need for the tick on the current CPU - * and restart it if necessary. 
- */ -void __tick_nohz_full_check(void) -{ - struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - - if (tick_nohz_full_cpu(smp_processor_id())) { - if (ts->tick_stopped && !is_idle_task(current)) { - if (!can_stop_full_tick()) - tick_nohz_restart_sched_tick(ts, ktime_get()); - } - } -} - static void nohz_full_kick_work_func(struct irq_work *work) { - __tick_nohz_full_check(); + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { @@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu) static void nohz_full_kick_ipi(void *info) { - __tick_nohz_full_check(); + /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } /* @@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void) * It might need the tick due to per task/process properties: * perf events, posix cpu timers, ... */ -void __tick_nohz_task_switch(struct task_struct *tsk) +void __tick_nohz_task_switch(void) { unsigned long flags; @@ -308,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str) __setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* - * If we handle the timekeeping duty for full dynticks CPUs, - * we can't safely shutdown that CPU. + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; @@ -388,6 +371,12 @@ void __init tick_nohz_init(void) cpu_notifier(tick_nohz_cpu_down_callback, 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); + + /* + * We need at least one CPU to handle housekeeping work such + * as timekeeping, unbound timers, workqueues, ... 
+ */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif @@ -705,21 +694,38 @@ out: return tick; } -static void tick_nohz_full_stop_tick(struct tick_sched *ts) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); + + calc_load_exit_idle(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_full_update_tick(struct tick_sched *ts) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); - if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + if (!tick_nohz_full_cpu(cpu)) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (!can_stop_full_tick()) - return; - - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + if (can_stop_full_tick()) + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + else if (ts->tick_stopped) + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -849,7 +855,7 @@ void tick_nohz_irq_exit(void) if (ts->inidle) __tick_nohz_idle_enter(ts); else - tick_nohz_full_stop_tick(ts); + tick_nohz_full_update_tick(ts); } /** @@ -864,23 +870,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) -{ - /* Update jiffies first */ - tick_do_update_jiffies64(now); - update_cpu_load_nohz(); - - calc_load_exit_idle(); - touch_softlockup_watchdog(); - /* - * Cancel the scheduled timer and restore the tick - */ - ts->tick_stopped = 0; - ts->idle_exittime = now; - - tick_nohz_restart(ts, now); -} - static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/time/time.c b/kernel/time/time.c index 85d5bb1d67eb..86751c68e08d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs); unsigned int jiffies_to_usecs(const unsigned long j) { -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + /* + * Hz usually doesn't go much further MSEC_PER_SEC. + * jiffies_to_usecs() and usecs_to_jiffies() depend on that. + */ + BUILD_BUG_ON(HZ > USEC_PER_SEC); + +#if !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); #else # if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; @@ -287,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs); * @t: Timespec * @gran: Granularity in ns. * - * Truncate a timespec to a granularity. gran must be smaller than a second. - * Always rounds down. - * - * This function should be only used for timestamps returned by - * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the latter. + * Truncate a timespec to a granularity. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec timespec_trunc(struct timespec t, unsigned gran) { - /* - * Division is pretty slow so avoid it for common cases. - * Currently current_kernel_time() never returns better than - * jiffies resolution. Exploit that. - */ - if (gran <= jiffies_to_usecs(1) * 1000) { + /* Avoid division in the common cases 1 ns and 1 s. 
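The rewritten timespec_trunc() drops the jiffies-based shortcut and special-cases the two granularities that commonly occur (1 ns and 1 s), warning on anything outside (0, NSEC_PER_SEC]. A user-space rendering of the same rounding, handy for sanity-checking what a millisecond-granularity filesystem timestamp ends up as (the kernel WARN is replaced by a plain message):

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000U

/* Mirror of the new timespec_trunc() logic; always rounds down. */
static struct timespec trunc_ts(struct timespec t, unsigned gran)
{
	if (gran == 1) {
		/* nothing to do */
	} else if (gran == NSEC_PER_SEC) {
		t.tv_nsec = 0;
	} else if (gran > 1 && gran < NSEC_PER_SEC) {
		t.tv_nsec -= t.tv_nsec % gran;
	} else {
		fprintf(stderr, "illegal file time granularity: %u\n", gran);
	}
	return t;
}

int main(void)
{
	struct timespec t = { .tv_sec = 5, .tv_nsec = 123456789 };

	t = trunc_ts(t, 1000000);	/* 1 ms granularity */
	printf("%ld.%09ld\n", (long)t.tv_sec, t.tv_nsec);	/* prints: 5.123000000 */
	return 0;
}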
*/ + if (gran == 1) { /* nothing */ - } else if (gran == 1000000000) { + } else if (gran == NSEC_PER_SEC) { t.tv_nsec = 0; - } else { + } else if (gran > 1 && gran < NSEC_PER_SEC) { t.tv_nsec -= t.tv_nsec % gran; + } else { + WARN(1, "illegal file time granularity: %u", gran); } return t; } @@ -546,7 +544,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies); * value to a scaled second value. */ static unsigned long -__timespec_to_jiffies(unsigned long sec, long nsec) +__timespec64_to_jiffies(u64 sec, long nsec) { nsec = nsec + TICK_NSEC - 1; @@ -554,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec) sec = MAX_SEC_IN_JIFFIES; nsec = 0; } - return (((u64)sec * SEC_CONVERSION) + + return ((sec * SEC_CONVERSION) + (((u64)nsec * NSEC_CONVERSION) >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); + return __timespec64_to_jiffies((u64)sec, nsec); } -EXPORT_SYMBOL(timespec_to_jiffies); +unsigned long +timespec64_to_jiffies(const struct timespec64 *value) +{ + return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec); +} +EXPORT_SYMBOL(timespec64_to_jiffies); void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { /* * Convert jiffies to nanoseconds and separate with @@ -580,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) NSEC_PER_SEC, &rem); value->tv_nsec = rem; } -EXPORT_SYMBOL(jiffies_to_timespec); +EXPORT_SYMBOL(jiffies_to_timespec64); /* * We could use a similar algorithm to timespec_to_jiffies (with a diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c7388dee8635..c48688904f9f 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -39,7 +39,7 @@ define fmuls(b,n,d) { } define timeconst(hz) { - print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Automatically generated by kernel/time/timeconst.bc */\n" print "/* Time conversion constants for HZ == ", hz, " */\n" print "\n" diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bca3667a2de1..b1356b7ae570 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -849,7 +849,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); #ifdef CONFIG_NTP_PPS /** - * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format * @ts_raw: pointer to the timespec to be set to raw monotonic time * @ts_real: pointer to the timespec to be set to the time of day * @@ -857,7 +857,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); * same time atomically and stores the resulting timestamps in timespec * format. 
*/ -void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; @@ -868,7 +868,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) do { seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = timespec64_to_timespec(tk->raw_time); + *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; @@ -877,10 +877,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec_add_ns(ts_raw, nsecs_raw); - timespec_add_ns(ts_real, nsecs_real); + timespec64_add_ns(ts_raw, nsecs_raw); + timespec64_add_ns(ts_real, nsecs_real); } -EXPORT_SYMBOL(getnstime_raw_and_real); +EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); #endif /* CONFIG_NTP_PPS */ @@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; unsigned long flags; + int ret = 0; if (!timespec64_valid_strict(ts)) return -EINVAL; @@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts) ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { + ret = -EINVAL; + goto out; + } + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); tk_set_xtime(tk, ts); - +out: timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); @@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), ts64); - if (!timespec64_valid_strict(&tmp)) { + if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 || + !timespec64_valid_strict(&tmp)) { ret = -EINVAL; goto error; } @@ -1244,7 +1251,7 @@ void __init timekeeping_init(void) set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); tk_set_wall_to_mono(tk, tmp); - timekeeping_update(tk, TK_MIRROR); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1607,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs(tick_error); + tick_error = abs64(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; @@ -1667,7 +1674,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * - * Helper function that accumulates a the nsecs greater then a second + * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. 
* @@ -1719,7 +1726,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; - /* If the offset is smaller then a shifted interval, do nothing */ + /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; @@ -1874,7 +1881,7 @@ struct timespec __current_kernel_time(void) return timespec64_to_timespec(tk_xtime(tk)); } -struct timespec current_kernel_time(void) +struct timespec64 current_kernel_time64(void) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now; @@ -1886,9 +1893,9 @@ struct timespec current_kernel_time(void) now = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - return timespec64_to_timespec(now); + return now; } -EXPORT_SYMBOL(current_kernel_time); +EXPORT_SYMBOL(current_kernel_time64); struct timespec64 get_monotonic_coarse64(void) { @@ -2018,7 +2025,7 @@ int do_adjtimex(struct timex *txc) /** * hardpps() - Accessor function to NTP __hardpps function */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { unsigned long flags; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5e097fa9faf7..74591ba9474f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -461,10 +461,17 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) static void timer_stats_account_timer(struct timer_list *timer) { - if (likely(!timer->start_site)) + void *site; + + /* + * start_site can be concurrently reset by + * timer_stats_timer_clear_start_info() + */ + site = READ_ONCE(timer->start_site); + if (likely(!site)) return; - timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer_stats_update_stats(timer, timer->start_pid, site, timer->function, timer->start_comm, timer->flags); } @@ -807,8 +814,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->flags &= ~TIMER_BASEMASK; - timer->flags |= base->cpu; + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); } } @@ -867,7 +874,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) if (mask == 0) return expires; - bit = find_last_bit(&mask, BITS_PER_LONG); + bit = __fls(mask); mask = (1UL << bit) - 1; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a4536e1e3e2a..f75e35b60149 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); - print_active_timers(m, base, now); + print_active_timers(m, base, now + ktime_to_ns(base->offset)); } static void print_cpu(struct seq_file *m, int cpu, u64 now) @@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - if (dev->set_mode) { 
- SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); SEQ_printf(m, "\n"); - } else { - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + } - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); } SEQ_printf(m, " event_handler: "); diff --git a/kernel/torture.c b/kernel/torture.c index 3e4840633d3e..44aa462d033f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -523,6 +523,7 @@ static int stutter; */ void stutter_wait(const char *title) { + cond_resched_rcu_qs(); while (READ_ONCE(stutter_pause_test) || (torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..8d6363f42169 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help @@ -635,6 +635,13 @@ config TRACE_ENUM_MAP_FILE If unsure, say N +config TRACING_EVENTS_GPIO + bool "Trace gpio events" + depends on GPIOLIB + default y + help + Enable tracing events for gpio subsystem + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b3e6b39b6cf9..a990824c8604 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -103,7 +103,7 @@ record_it: memcpy((void *) t + sizeof(*t), data, len); if (blk_tracer) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); } } @@ -278,7 +278,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); return; } } @@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; struct dentry *dir = NULL; int ret; @@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->trace_state = Blktrace_setup; ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); + if 
(cmpxchg(&q->blk_trace, NULL, bt)) goto err; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); @@ -778,9 +775,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - if (!error && !bio_flagged(bio, BIO_UPTODATE)) - error = EIO; - __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, what, error, 0, NULL); } @@ -887,8 +881,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); + bio->bi_error, sizeof(rpdu), &rpdu); } } @@ -920,8 +913,8 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + sizeof(r), &r); } /** @@ -1347,6 +1340,7 @@ static const struct { static enum print_line_t print_one_line(struct trace_iterator *iter, bool classic) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; @@ -1355,7 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, t = te_blk_io_trace(iter->ent); what = t->action & ((1 << BLK_TC_SHIFT) - 1); - long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; if (t->action == BLK_TN_MESSAGE) { @@ -1417,9 +1411,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; else - trace_flags |= TRACE_ITER_CONTEXT_INFO; + tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; } return 0; } @@ -1485,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { - struct blk_trace *old_bt, *bt = NULL; + struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); @@ -1501,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q, blk_trace_setup_lba(bt, bdev); - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - ret = -EBUSY; + ret = -EBUSY; + if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..4228fd3682c3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = { /* * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) { char *fmt = (char *) (long) r1; + bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; + u64 unsafe_addr; + char buf[64]; int i; /* @@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p') { + } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; i++; if 
(!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) return -EINVAL; fmt_cnt++; + if (fmt[i - 1] == 's') { + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + switch (fmt_cnt) { + case 1: + unsafe_addr = r3; + r3 = (long) buf; + break; + case 2: + unsafe_addr = r4; + r4 = (long) buf; + break; + case 3: + unsafe_addr = r5; + r5 = (long) buf; + break; + } + buf[0] = 0; + strncpy_from_unsafe(buf, + (void *) (long) unsafe_addr, + sizeof(buf)); + } continue; } @@ -158,6 +186,84 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* make sure event is local and doesn't have pmu::count */ + if (event->oncpu != smp_processor_id() || + event->pmu->count) + return -EINVAL; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. + */ + return perf_event_read_local(event); +} + +static const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + void *data = (void *) (long) r4; + struct perf_sample_data sample_data; + struct perf_event *event; + struct perf_raw_record raw = { + .size = size, + .data = data, + }; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (unlikely(!event)) + return -ENOENT; + + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || + event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) + return -EINVAL; + + if (unlikely(event->oncpu != smp_processor_id())) + return -EOPNOTSUPP; + + perf_sample_data_init(&sample_data, 0, 0); + sample_data.raw = &raw; + perf_event_output(event, &sample_data, regs); + return 0; +} + +static const struct bpf_func_proto bpf_perf_event_output_proto = { + .func = bpf_perf_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +289,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto; default: return NULL; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb11011b5292..3f743b147247 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -243,6 +243,11 @@ static void ftrace_sync_ipi(void *data) #ifdef 
CONFIG_FUNCTION_GRAPH_TRACER static void update_function_graph_func(void); + +/* Both enabled by default (can be cleared by function_graph tracer flags */ +static bool fgraph_sleep_time = true; +static bool fgraph_graph_time = true; + #else static inline void update_function_graph_func(void) { } #endif @@ -630,13 +635,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) @@ -912,7 +922,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) calltime = trace->rettime - trace->calltime; - if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + if (!fgraph_graph_time) { int index; index = trace->depth; @@ -3415,27 +3425,35 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -static int ftrace_match(char *str, char *regex, int len, int type) +/* Type for quick search ftrace basic regexes (globs) from filter_parse_regex */ +struct ftrace_glob { + char *search; + unsigned len; + int type; +}; + +static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; - switch (type) { + switch (g->type) { case MATCH_FULL: - if (strcmp(str, regex) == 0) + if (strcmp(str, g->search) == 0) matched = 1; break; case MATCH_FRONT_ONLY: - if (strncmp(str, regex, len) == 0) + if (strncmp(str, g->search, g->len) == 0) matched = 1; break; case MATCH_MIDDLE_ONLY: - if (strstr(str, regex)) + if (strstr(str, g->search)) matched = 1; break; case MATCH_END_ONLY: slen = strlen(str); - if (slen >= len && memcmp(str + slen - len, regex, len) == 0) + if (slen >= g->len && + memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; } @@ -3444,13 +3462,13 @@ static int ftrace_match(char *str, char *regex, int len, int type) } static int -enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) { struct ftrace_func_entry *entry; int ret = 0; entry = ftrace_lookup_ip(hash, rec->ip); - if (not) { + if (clear_filter) { /* Do nothing if it doesn't exist */ if (!entry) return 0; @@ -3467,42 +3485,68 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) } static int -ftrace_match_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) +ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, + struct ftrace_glob *mod_g, int exclude_mod) { char str[KSYM_SYMBOL_LEN]; char *modname; kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - if (mod) { - /* module lookup requires matching the module */ - if (!modname || strcmp(modname, mod)) + if (mod_g) { + int mod_matches = (modname) ? 
ftrace_match(modname, mod_g) : 0; + + /* blank module name to match all modules */ + if (!mod_g->len) { + /* blank module globbing: modname xor exclude_mod */ + if ((!exclude_mod) != (!modname)) + goto func_match; + return 0; + } + + /* not matching the module */ + if (!modname || !mod_matches) { + if (exclude_mod) + goto func_match; + else + return 0; + } + + if (mod_matches && exclude_mod) return 0; +func_match: /* blank search means to match all funcs in the mod */ - if (!len) + if (!func_g->len) return 1; } - return ftrace_match(str, regex, len, type); + return ftrace_match(str, func_g); } static int -match_records(struct ftrace_hash *hash, char *buff, - int len, char *mod, int not) +match_records(struct ftrace_hash *hash, char *func, int len, char *mod) { - unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; + struct ftrace_glob func_g = { .type = MATCH_FULL }; + struct ftrace_glob mod_g = { .type = MATCH_FULL }; + struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL; + int exclude_mod = 0; int found = 0; int ret; + int clear_filter; - if (len) { - type = filter_parse_regex(buff, len, &search, ¬); - search_len = strlen(search); + if (func) { + func_g.type = filter_parse_regex(func, len, &func_g.search, + &clear_filter); + func_g.len = strlen(func_g.search); + } + + if (mod) { + mod_g.type = filter_parse_regex(mod, strlen(mod), + &mod_g.search, &exclude_mod); + mod_g.len = strlen(mod_g.search); } mutex_lock(&ftrace_lock); @@ -3511,8 +3555,8 @@ match_records(struct ftrace_hash *hash, char *buff, goto out_unlock; do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, mod, search, search_len, type)) { - ret = enter_record(hash, rec, not); + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { + ret = enter_record(hash, rec, clear_filter); if (ret < 0) { found = ret; goto out_unlock; @@ -3529,26 +3573,9 @@ match_records(struct ftrace_hash *hash, char *buff, static int ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) { - return match_records(hash, buff, len, NULL, 0); + return match_records(hash, buff, len, NULL); } -static int -ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) -{ - int not = 0; - - /* blank or '*' mean the same */ - if (strcmp(buff, "*") == 0) - buff[0] = 0; - - /* handle the case of 'dont filter this module' */ - if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { - buff[0] = 0; - not = 1; - } - - return match_records(hash, buff, strlen(buff), mod, not); -} /* * We register the module command as a template to show others how @@ -3557,10 +3584,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) static int ftrace_mod_callback(struct ftrace_hash *hash, - char *func, char *cmd, char *param, int enable) + char *func, char *cmd, char *module, int enable) { - char *mod; - int ret = -EINVAL; + int ret; /* * cmd == 'mod' because we only registered this func @@ -3569,21 +3595,11 @@ ftrace_mod_callback(struct ftrace_hash *hash, * you can tell which command was used by the cmd * parameter. 
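The new struct ftrace_glob simply bundles the (search, len, type) triple that filter_parse_regex() produces, so ftrace_match() and the module matching can share one representation. A stand-alone sketch of the four match types, assuming the usual meaning of the MATCH_* values seen in the switch above (full string, prefix, substring, suffix):

#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

struct glob {
	const char *search;
	size_t len;
	int type;
};

/* Same shape as ftrace_match(): one comparison per glob type. */
static int glob_match(const char *str, const struct glob *g)
{
	size_t slen = strlen(str);

	switch (g->type) {
	case MATCH_FULL:	return strcmp(str, g->search) == 0;
	case MATCH_FRONT_ONLY:	return strncmp(str, g->search, g->len) == 0;
	case MATCH_MIDDLE_ONLY:	return strstr(str, g->search) != NULL;
	case MATCH_END_ONLY:	return slen >= g->len &&
				       memcmp(str + slen - g->len, g->search, g->len) == 0;
	}
	return 0;
}

int main(void)
{
	/* "*sched*" parses to MATCH_MIDDLE_ONLY with search = "sched". */
	struct glob g = { .search = "sched", .len = 5, .type = MATCH_MIDDLE_ONLY };

	printf("%d\n", glob_match("schedule_timeout", &g));	/* prints: 1 */
	return 0;
}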
*/ - - /* we must have a module name */ - if (!param) - return ret; - - mod = strsep(¶m, ":"); - if (!strlen(mod)) - return ret; - - ret = ftrace_match_module_records(hash, func, mod); + ret = match_records(hash, func, strlen(func), module); if (!ret) - ret = -EINVAL; + return -EINVAL; if (ret < 0) return ret; - return 0; } @@ -3694,19 +3710,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, { struct ftrace_ops_hash old_hash_ops; struct ftrace_func_probe *entry; + struct ftrace_glob func_g; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; - int type, len, not; + int not; unsigned long key; int count = 0; - char *search; int ret; - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -3733,7 +3750,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, NULL, search, len, type)) + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -3806,24 +3823,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; + struct ftrace_glob func_g; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; - int type = MATCH_FULL; - int i, len = 0; - char *search; - int ret; + int i, ret; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) - glob = NULL; + func_g.search = NULL; else if (glob) { int not; - type = filter_parse_regex(glob, strlen(glob), &search, ¬); - len = strlen(search); + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); + func_g.search = glob; /* we do not support '!' 
for function probes */ if (WARN_ON(not)) @@ -3852,10 +3869,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; /* do this last, since it is the most expensive */ - if (glob) { + if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); - if (!ftrace_match(str, glob, len, type)) + if (!ftrace_match(str, &func_g)) continue; } @@ -3884,7 +3901,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ftrace_free_entry(entry); } mutex_unlock(&ftrace_lock); - + out_unlock: mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); @@ -4600,21 +4617,21 @@ ftrace_graph_release(struct inode *inode, struct file *file) static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { + struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page *pg; - int search_len; int fail = 1; - int type, not; - char *search; + int not; bool exists; int i; /* decode regex */ - type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); + func_g.type = filter_parse_regex(buffer, strlen(buffer), + &func_g.search, ¬); if (!not && *idx >= size) return -EBUSY; - search_len = strlen(search); + func_g.len = strlen(func_g.search); mutex_lock(&ftrace_lock); @@ -4625,7 +4642,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, NULL, search, search_len, type)) { + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { @@ -4778,17 +4795,6 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } -static void ftrace_swap_ips(void *a, void *b, int size) -{ - unsigned long *ipa = a; - unsigned long *ipb = b; - unsigned long t; - - t = *ipa; - *ipa = *ipb; - *ipb = t; -} - static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -4808,7 +4814,7 @@ static int ftrace_process_locs(struct module *mod, return 0; sort(start, count, sizeof(*start), - ftrace_cmp_ips, ftrace_swap_ips); + ftrace_cmp_ips, NULL); start_pg = ftrace_allocate_pages(count); if (!start_pg) @@ -5634,6 +5640,16 @@ static struct ftrace_ops graph_ops = { ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; +void ftrace_graph_sleep_time_control(bool enable) +{ + fgraph_sleep_time = enable; +} + +void ftrace_graph_graph_time_control(bool enable) +{ + fgraph_graph_time = enable; +} + int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; @@ -5692,7 +5708,7 @@ free: } static void -ftrace_graph_probe_sched_switch(void *ignore, +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; @@ -5702,7 +5718,7 @@ ftrace_graph_probe_sched_switch(void *ignore, * Does the user want to count the time a function was asleep. * If so, do not update the time stamps. */ - if (trace_flags & TRACE_ITER_SLEEP_TIME) + if (fgraph_sleep_time) return; timestamp = trace_clock_local(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..75f1d05ea82d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -400,6 +400,17 @@ struct rb_irq_work { }; /* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + +/* * Used for which event context the event is in. 
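Earlier in this hunk, ftrace_swap_ips() is deleted and ftrace_process_locs() passes NULL as the swap callback, relying on the kernel's sort() to substitute its built-in swap for plain word-sized elements. The comparator still matters: it compares the addresses instead of subtracting them, so the int result cannot overflow. A userspace analogue with qsort() (the addresses are made up):

#include <stdio.h>
#include <stdlib.h>

/*
 * Compare two instruction addresses without computing a - b, which could
 * overflow the int return value; same intent as ftrace_cmp_ips().
 */
static int cmp_ips(const void *a, const void *b)
{
    const unsigned long *ipa = a;
    const unsigned long *ipb = b;

    if (*ipa > *ipb)
        return 1;
    if (*ipa < *ipb)
        return -1;
    return 0;
}

int main(void)
{
    unsigned long ips[] = { 0xc0123000UL, 0xc0045000UL, 0xc0987000UL };
    size_t i;

    qsort(ips, sizeof(ips) / sizeof(ips[0]), sizeof(ips[0]), cmp_ips);

    for (i = 0; i < sizeof(ips) / sizeof(ips[0]); i++)
        printf("%#lx\n", ips[i]);
    return 0;
}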
* NMI = 0 * IRQ = 1 @@ -818,7 +829,7 @@ rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, * writer is ever on it, the previous pointer never points * back to the reader page. */ -static int rb_is_reader_page(struct buffer_page *page) +static bool rb_is_reader_page(struct buffer_page *page) { struct list_head *list = page->list.prev; @@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. - */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? */ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. 
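The delta handling in rb_add_time_stamp() and rb_update_event() rests on the event header having only 27 bits of time_delta; a larger gap is written as a TIME_EXTEND event whose array[0] carries the upper bits. A standalone sketch of that split, assuming TS_SHIFT is 27 as in ring_buffer.c and using a simplified struct rather than the real bitfield layout:

#include <stdio.h>
#include <stdint.h>

#define TS_SHIFT    27
#define TS_MASK     ((1ULL << TS_SHIFT) - 1)

/* Simplified stand-in for the time-extend payload of a ring buffer event. */
struct time_extend {
    uint32_t time_delta;    /* low 27 bits of the delta */
    uint32_t array0;        /* upper bits, as stored in array[0] */
};

static void encode_delta(struct time_extend *ev, uint64_t delta)
{
    ev->time_delta = delta & TS_MASK;
    ev->array0 = delta >> TS_SHIFT;
}

static uint64_t decode_delta(const struct time_extend *ev)
{
    return ((uint64_t)ev->array0 << TS_SHIFT) | ev->time_delta;
}

int main(void)
{
    struct time_extend ev;
    uint64_t delta = 3ULL * 1000 * 1000 * 1000;   /* ~3s in ns: too big for 27 bits */

    encode_delta(&ev, delta);
    printf("low=%u high=%u round-trip=%llu\n",
           (unsigned)ev.time_delta, (unsigned)ev.array0,
           (unsigned long long)decode_delta(&ev));
    return 0;
}

The decode direction is what rb_update_write_stamp() does when it sees a TIME_EXTEND: array[0] shifted up by TS_SHIFT plus time_delta.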
- */ -static void -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) -{ - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } -static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) { - struct buffer_page *tail_page; - struct ring_buffer_event *event; - unsigned long tail, write; + event->type_len = RINGBUF_TYPE_TIME_EXTEND; - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + /* Not the first event on the page? 
*/ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + return skip_time_extend(event); +} - /* set write to only the index of the write */ - write &= RB_WRITE_MASK; - tail = write - length; +static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. + * If we need to add a timestamp, then we + * add it to the start of the resevered space. */ - if (!tail) + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; delta = 0; + } - /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} - /* We reserved something on the buffer */ +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ - event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + /* zero length can cause confusions */ + if (!length) + length++; - local_inc(&tail_page->entries); + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* - * If this is the first commit on the page, then update - * its timestamp. + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). 
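Putting numbers to the comment above: with the usual values RB_EVNT_HDR_SIZE = 4, RB_ALIGNMENT = 4, RB_LEN_TIME_EXTEND = 8 and RB_MAX_SMALL_DATA = 112 (all config-dependent; RB_FORCE_8BYTE_ALIGNMENT is ignored here), the length calculation plus the new 12-byte bump that follows look roughly like this userspace sketch:

#include <stdio.h>

/* Typical values from kernel/trace/ring_buffer.c (config-dependent). */
#define RB_ALIGNMENT        4
#define RB_EVNT_HDR_SIZE    4
#define RB_LEN_TIME_EXTEND  8
#define RB_MAX_SMALL_DATA   112
#define ALIGN_UP(x, a)      (((x) + (a) - 1) & ~((a) - 1))

static unsigned calc_event_length(unsigned length)
{
    if (!length)                       /* zero length can cause confusion */
        length++;

    if (length > RB_MAX_SMALL_DATA)    /* length goes in array[0], not type_len */
        length += sizeof(unsigned);    /* array[0] is 4 bytes */

    length += RB_EVNT_HDR_SIZE;
    length = ALIGN_UP(length, RB_ALIGNMENT);

    /*
     * A 12-byte reservation could later need to become a TIME_EXTEND
     * (8 bytes) plus padding, but the smallest padding event is 8 bytes,
     * so 12 gets rounded up to 16, as the comment above explains.
     */
    if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
        length += RB_ALIGNMENT;

    return length;
}

int main(void)
{
    unsigned n;

    for (n = 0; n <= 16; n += 4)
        printf("payload %2u -> reserve %u\n", n, calc_event_length(n));
    return 0;
}

With these values only payloads of 5 to 8 bytes hit the 12-byte case and get rounded up to 16.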
*/ - if (!tail) - tail_page->page->time_stamp = ts; + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; - /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + return length; +} - return event; +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; } +#endif static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, @@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } } -static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer *buffer, - struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length) +static inline void rb_event_discard(struct ring_buffer_event *event) { - struct ring_buffer_event *event; - u64 ts, delta; - int nr_loops = 0; - int add_timestamp; - u64 diff; + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); - rb_start_commit(cpu_buffer); + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - /* - * Due to the ability to swap a cpu buffer from a buffer - * it is possible it was swapped before we committed. - * (committing stops a swap). We check for it here and - * if it happened, we have to fail the write. 
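rb_set_commit_to_write(), moved wholesale in this hunk, depends on writers on one CPU nesting like a stack: an interrupting writer always finishes before the interrupted one resumes, so only the outermost writer needs to drag the commit pointer up to the tail. A toy, single-threaded model of that invariant (no pages, no real concurrency; purely illustrative):

#include <stdio.h>

/* Toy model: every reserve advances 'tail'; 'commit' only catches up when
 * the outermost writer finishes, as in rb_set_commit_to_write(). */
static unsigned long tail, commit;
static int nesting;

static void reserve(unsigned long len)
{
    nesting++;
    tail += len;
}

static void commit_event(void)
{
    /* Nested writers return without touching 'commit'. */
    if (--nesting == 0)
        commit = tail;       /* the real code walks page by page */
}

int main(void)
{
    reserve(16);             /* outer event reserved */

    reserve(8);              /* "interrupt" writes its own event... */
    commit_event();          /* ...and finishes first: commit stays put */
    printf("after nested commit: tail=%lu commit=%lu\n", tail, commit);

    commit_event();          /* outermost commit catches up */
    printf("after outer commit:  tail=%lu commit=%lu\n", tail, commit);
    return 0;
}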
- */ - barrier(); - if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { - local_dec(&cpu_buffer->committing); - local_dec(&cpu_buffer->commits); - return NULL; - } -#endif +static inline bool +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; - length = rb_calculate_event_length(length); - again: - add_timestamp = 0; - delta = 0; + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; /* - * We allow for interrupts to reenter here and do a trace. - * If one does, it will cause this original code to loop - * back here. Even with heavy interrupts happening, this - * should only happen a few times in a row. If this happens - * 1000 times in a row, there must be either an interrupt - * storm or we have something buggy. - * Bail! + * The event first in the commit queue updates the + * time stamp. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) - goto out_fail; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} - /* make sure this diff is calculated here */ - barrier(); +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; - /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? 
"" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; - } + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); - if (unlikely(PTR_ERR(event) == -EAGAIN)) - goto again; - - if (!event) - goto out_fail; + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } - return event; + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - out_fail: - rb_end_commit(cpu_buffer); - return NULL; + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /* @@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + struct ring_buffer_event *event; + struct buffer_page *tail_page; + unsigned long tail, write; + + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; + + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + tail = write - info->length; + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. 
+ */ + if (!tail) + info->delta = 0; + + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) + return rb_move_tail(cpu_buffer, tail, info); + + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + kmemcheck_annotate_bitfield(event, bitfield); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); + + /* + * If this is the first commit on the page, then update + * its timestamp. + */ + if (!tail) + tail_page->page->time_stamp = info->ts; + + /* account for these added bytes */ + local_add(info->length, &cpu_buffer->entries_bytes); + + return event; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) +{ + struct ring_buffer_event *event; + struct rb_event_info info; + int nr_loops = 0; + u64 diff; + + rb_start_commit(cpu_buffer); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + + info.length = rb_calculate_event_length(length); + again: + info.add_timestamp = 0; + info.delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; + + /* make sure this diff is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); + } + + event = __rb_reserve_next(cpu_buffer, &info); + + if (unlikely(PTR_ERR(event) == -EAGAIN)) + goto again; + + if (!event) + goto out_fail; + + return event; + + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +/** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) @@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
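The reservation step in __rb_reserve_next() above stays lock-free: local_add_return() bumps the page's write index by the event length, and the writer owns [tail, tail + length) if the result still fits in the page, otherwise rb_move_tail() takes over. The same idea in portable C11 atomics (illustrative only; the kernel's local_t is cheaper because the counter is strictly per-CPU, and the RB_WRITE_MASK bookkeeping and real BUF_PAGE_SIZE are omitted):

#include <stdio.h>
#include <stdatomic.h>

#define BUF_PAGE_SIZE 4096    /* stand-in; the real value excludes the page header */

struct page_hdr {
    atomic_ulong write;       /* next free offset within the page */
};

/* Reserve 'len' bytes; returns the start offset or -1 if the page is full. */
static long page_reserve(struct page_hdr *pg, unsigned long len)
{
    unsigned long write = atomic_fetch_add(&pg->write, len) + len;

    if (write > BUF_PAGE_SIZE)
        return -1;            /* the real code falls back to rb_move_tail() */
    return (long)(write - len);
}

int main(void)
{
    struct page_hdr pg;
    long off1, off2;

    atomic_init(&pg.write, 0);
    off1 = page_reserve(&pg, 24);
    off2 = page_reserve(&pg, 4090);

    printf("first=%ld second=%ld\n", off1, off2);
    return 0;
}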
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. - */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer @@ -2999,7 +3039,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = rb_set_head_page(cpu_buffer); @@ -3007,7 +3047,7 @@ static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) /* In case of error, head will be NULL */ if (unlikely(!head)) - return 1; + return true; return reader->read == rb_page_commit(reader) && (commit == reader || @@ -4227,7 +4267,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset); * rind_buffer_empty - is the ring buffer empty? 
* @buffer: The ring buffer to test */ -int ring_buffer_empty(struct ring_buffer *buffer) +bool ring_buffer_empty(struct ring_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; @@ -4245,10 +4285,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) local_irq_restore(flags); if (!ret) - return 0; + return false; } - return 1; + return true; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -4257,7 +4297,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); * @buffer: The ring buffer * @cpu: The CPU buffer to test */ -int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; @@ -4265,7 +4305,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; + return true; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a1503a027ee2..6df9a83e20d7 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -24,8 +24,8 @@ struct rb_page { static int wakeup_interval = 100; static int reader_finish; -static struct completion read_start; -static struct completion read_done; +static DECLARE_COMPLETION(read_start); +static DECLARE_COMPLETION(read_done); static struct ring_buffer *buffer; static struct task_struct *producer; @@ -60,12 +60,12 @@ MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); static int read_events; -static int kill_test; +static int test_error; -#define KILL_TEST() \ +#define TEST_ERROR() \ do { \ - if (!kill_test) { \ - kill_test = 1; \ + if (!test_error) { \ + test_error = 1; \ WARN_ON(1); \ } \ } while (0) @@ -75,6 +75,11 @@ enum event_status { EVENT_DROPPED, }; +static bool break_test(void) +{ + return test_error || kthread_should_stop(); +} + static enum event_status read_event(int cpu) { struct ring_buffer_event *event; @@ -87,7 +92,7 @@ static enum event_status read_event(int cpu) entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); return EVENT_DROPPED; } @@ -115,10 +120,10 @@ static enum event_status read_page(int cpu) rpage = bpage; /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; - for (i = 0; i < commit && !kill_test; i += inc) { + for (i = 0; i < commit && !test_error ; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { - KILL_TEST(); + TEST_ERROR(); break; } @@ -128,7 +133,7 @@ static enum event_status read_page(int cpu) case RINGBUF_TYPE_PADDING: /* failed writes may be discarded events */ if (!event->time_delta) - KILL_TEST(); + TEST_ERROR(); inc = event->array[0] + 4; break; case RINGBUF_TYPE_TIME_EXTEND: @@ -137,12 +142,12 @@ static enum event_status read_page(int cpu) case 0: entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); break; } read++; if (!event->array[0]) { - KILL_TEST(); + TEST_ERROR(); break; } inc = event->array[0] + 4; @@ -150,17 +155,17 @@ static enum event_status read_page(int cpu) default: entry = ring_buffer_event_data(event); if (*entry != cpu) { - KILL_TEST(); + TEST_ERROR(); break; } read++; inc = ((event->type_len + 1) * 4); } - if (kill_test) + if (test_error) break; if (inc <= 0) { - KILL_TEST(); + TEST_ERROR(); break; } } @@ -178,10 +183,14 @@ static void ring_buffer_consumer(void) read_events ^= 1; read = 0; - while (!reader_finish 
&& !kill_test) { - int found; + /* + * Continue running until the producer specifically asks to stop + * and is ready for the completion. + */ + while (!READ_ONCE(reader_finish)) { + int found = 1; - do { + while (found && !test_error) { int cpu; found = 0; @@ -193,19 +202,25 @@ static void ring_buffer_consumer(void) else stat = read_page(cpu); - if (kill_test) + if (test_error) break; + if (stat == EVENT_FOUND) found = 1; + } - } while (found && !kill_test); + } + /* Wait till the producer wakes us up when there is more data + * available or when the producer wants us to finish reading. + */ set_current_state(TASK_INTERRUPTIBLE); if (reader_finish) break; schedule(); } + __set_current_state(TASK_RUNNING); reader_finish = 0; complete(&read_done); } @@ -263,10 +278,7 @@ static void ring_buffer_producer(void) if (cnt % wakeup_interval) cond_resched(); #endif - if (kthread_should_stop()) - kill_test = 1; - - } while (ktime_before(end_time, timeout) && !kill_test); + } while (ktime_before(end_time, timeout) && !break_test()); trace_printk("End ring buffer hammer\n"); if (consumer) { @@ -276,8 +288,6 @@ static void ring_buffer_producer(void) /* the completions must be visible before the finish var */ smp_wmb(); reader_finish = 1; - /* finish var visible before waking up the consumer */ - smp_wmb(); wake_up_process(consumer); wait_for_completion(&read_done); } @@ -287,7 +297,7 @@ static void ring_buffer_producer(void) entries = ring_buffer_entries(buffer); overruns = ring_buffer_overruns(buffer); - if (kill_test && !kthread_should_stop()) + if (test_error) trace_printk("ERROR!\n"); if (!disable_reader) { @@ -368,15 +378,14 @@ static void wait_to_die(void) static int ring_buffer_consumer_thread(void *arg) { - while (!kthread_should_stop() && !kill_test) { + while (!break_test()) { complete(&read_start); ring_buffer_consumer(); set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop() || kill_test) + if (break_test()) break; - schedule(); } __set_current_state(TASK_RUNNING); @@ -389,27 +398,27 @@ static int ring_buffer_consumer_thread(void *arg) static int ring_buffer_producer_thread(void *arg) { - init_completion(&read_start); - - while (!kthread_should_stop() && !kill_test) { + while (!break_test()) { ring_buffer_reset(buffer); if (consumer) { - smp_wmb(); wake_up_process(consumer); wait_for_completion(&read_start); } ring_buffer_producer(); - if (kill_test) + if (break_test()) goto out_kill; trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); + if (break_test()) + goto out_kill; schedule_timeout(HZ * SLEEP_TIME); } out_kill: + __set_current_state(TASK_RUNNING); if (!kthread_should_stop()) wait_to_die(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..2198a630ef58 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -214,12 +214,10 @@ __setup("alloc_snapshot", boot_alloc_snapshot); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; -static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - trace_boot_options = trace_boot_options_buf; return 0; } __setup("trace_options=", set_trace_boot_options); @@ -250,6 +248,19 @@ unsigned long long ns2usecs(cycle_t nsec) return nsec; } +/* trace_flags holds trace_options default values */ +#define TRACE_DEFAULT_FLAGS \ + (FUNCTION_DEFAULT_FLAGS | \ + TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ + TRACE_ITER_RECORD_CMD | 
TRACE_ITER_OVERWRITE | \ + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS) + +/* trace_options that are only supported by global_trace */ +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ + TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) + + /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -262,7 +273,9 @@ unsigned long long ns2usecs(cycle_t nsec) * pages for the buffer for that CPU. Each CPU has the same number * of pages allocated for its buffer. */ -static struct trace_array global_trace; +static struct trace_array global_trace = { + .trace_flags = TRACE_DEFAULT_FLAGS, +}; LIST_HEAD(ftrace_trace_arrays); @@ -468,11 +481,29 @@ static inline void trace_access_lock_init(void) #endif -/* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs); +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs); + +#else +static inline void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ +} +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) +{ +} + +#endif static void tracer_tracing_on(struct trace_array *tr) { @@ -518,7 +549,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) int alloc; int pc; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); @@ -548,7 +579,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc); + ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); return size; } @@ -568,7 +599,7 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int pc; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; pc = preempt_count(); @@ -588,7 +619,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 4, pc); + ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); return 1; } @@ -834,34 +865,18 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } +/* + * TRACE_FLAGS is defined as a tuple matching bit masks with strings. + * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list + * of strings in the order that the enums were defined. 
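The C(a, b) trick described here is a classic X-macro: one central list (TRACE_FLAGS, kept in the tracing headers) is expanded several times with different definitions of C() to generate both the bit definitions and the matching string table, so the two can never drift apart the way the old hand-written trace_options[] could. A self-contained illustration of the pattern (the flag names below are made up, not the real TRACE_FLAGS list):

#include <stdio.h>

/* One central list; expanded several times with different C() definitions. */
#define DEMO_FLAGS                              \
    C(PRINT_PARENT, "print-parent"),            \
    C(STACKTRACE,   "stacktrace"),              \
    C(OVERWRITE,    "overwrite"),

/* First expansion: an enum of bit positions. */
#undef C
#define C(a, b) DEMO_ITER_##a##_BIT
enum { DEMO_FLAGS DEMO_LAST_BIT };

/* Second expansion: the matching string table, in the same order. */
#undef C
#define C(a, b) b
static const char *demo_options[] = { DEMO_FLAGS NULL };

int main(void)
{
    int i;

    for (i = 0; demo_options[i]; i++)
        printf("bit %d = %s\n", i, demo_options[i]);
    return 0;
}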
+ */ +#undef C +#define C(a, b) b + /* These must match the bit postions in trace_iterator_flags */ static const char *trace_options[] = { - "print-parent", - "sym-offset", - "sym-addr", - "verbose", - "raw", - "hex", - "bin", - "block", - "stacktrace", - "trace_printk", - "ftrace_preempt", - "branch", - "annotate", - "userstacktrace", - "sym-userobj", - "printk-msg-only", - "context-info", - "latency-format", - "sleep-time", - "graph-time", - "record-cmd", - "overwrite", - "disable_on_free", - "irq-info", - "markers", - "function-trace", + TRACE_FLAGS NULL }; @@ -1204,13 +1219,17 @@ static inline int run_tracer_selftest(struct tracer *type) } #endif /* CONFIG_FTRACE_STARTUP_TEST */ +static void add_tracer_options(struct trace_array *tr, struct tracer *t); + +static void __init apply_trace_boot_options(void); + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer * * Register a new plugin tracer. */ -int register_tracer(struct tracer *type) +int __init register_tracer(struct tracer *type) { struct tracer *t; int ret = 0; @@ -1253,6 +1272,7 @@ int register_tracer(struct tracer *type) type->next = trace_types; trace_types = type; + add_tracer_options(&global_trace, type); out: tracing_selftest_running = false; @@ -1268,6 +1288,9 @@ int register_tracer(struct tracer *type) /* Do we want this tracer to start on bootup? */ tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; + + apply_trace_boot_options(); + /* disable other selftests, since this will break it. */ tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1671,23 +1694,16 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve ring_buffer_unlock_commit(buffer, event); } -static inline void -__trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } - -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1729,22 +1745,15 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); - -void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -1873,24 +1882,17 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, } -void ftrace_trace_stack_regs(struct 
ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs) +static inline void ftrace_trace_stack(struct trace_array *tr, + struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc, struct pt_regs *regs) { - if (!(trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; __ftrace_trace_stack(buffer, flags, skip, pc, regs); } -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) -{ - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); -} - void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { @@ -1929,7 +1931,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) struct userstack_entry *entry; struct stack_trace trace; - if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* @@ -2173,7 +2175,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); } out: @@ -2225,7 +2227,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } out: preempt_enable_notrace(); @@ -2246,7 +2248,7 @@ int trace_array_printk(struct trace_array *tr, int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -2261,7 +2263,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -2602,7 +2604,7 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); struct trace_buffer *buf = iter->trace_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; @@ -2664,20 +2666,22 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpumask_test_cpu(iter->cpu, iter->started)) + if (iter->started && cpumask_test_cpu(iter->cpu, iter->started)) return; if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; - cpumask_set_cpu(iter->cpu, iter->started); + if (iter->started) + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) @@ -2687,8 +2691,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter) static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { + struct trace_array 
*tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; @@ -2698,7 +2703,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { if (iter->iter_flags & TRACE_FILE_LAT_FMT) trace_print_lat_context(iter); else @@ -2718,13 +2723,14 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); @@ -2742,6 +2748,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; @@ -2749,7 +2756,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_HEX_FIELD(s, entry->pid); SEQ_PUT_HEX_FIELD(s, iter->cpu); SEQ_PUT_HEX_FIELD(s, iter->ts); @@ -2771,13 +2778,14 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD(s, entry->pid); SEQ_PUT_FIELD(s, iter->cpu); SEQ_PUT_FIELD(s, iter->ts); @@ -2826,6 +2834,8 @@ int trace_empty(struct trace_iterator *iter) /* Called with trace_event_read_lock() held. 
*/ enum print_line_t print_trace_line(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; if (iter->lost_events) { @@ -2871,6 +2881,7 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) @@ -2879,13 +2890,15 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) return; @@ -3035,7 +3048,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; @@ -3230,7 +3243,7 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -3477,7 +3490,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { - if (trace_flags & (1 << i)) + if (tr->trace_flags & (1 << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); @@ -3542,7 +3555,7 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ - if (!!(trace_flags & mask) == !!enabled) + if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ @@ -3551,9 +3564,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) return -EINVAL; if (enabled) - trace_flags |= mask; + tr->trace_flags |= mask; else - trace_flags &= ~mask; + tr->trace_flags &= ~mask; if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); @@ -3565,8 +3578,10 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) #endif } - if (mask == TRACE_ITER_PRINTK) + if (mask == TRACE_ITER_PRINTK) { trace_printk_start_stop_comm(enabled); + trace_printk_control(enabled); + } return 0; } @@ -3577,6 +3592,7 @@ static int trace_set_options(struct trace_array *tr, char *option) int neg = 0; int ret = -ENODEV; int i; + size_t orig_len = strlen(option); cmp = strstrip(option); @@ -3600,9 +3616,36 @@ static int trace_set_options(struct trace_array *tr, char *option) mutex_unlock(&trace_types_lock); + /* + * If the first trailing whitespace is replaced with '\0' by strstrip, + * turn it back into a space. 
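Both trace_set_options() here and the apply_trace_boot_options() loop that follows lean on the same trick: the parser ('\0' written by strstrip() or strsep()) is undone afterwards so the buffer can be walked again later. A userspace sketch of the strsep() half, with stand-in names (apply_option()/apply_options() are not the kernel functions):

#define _DEFAULT_SOURCE       /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static void apply_option(const char *opt)
{
    printf("applying '%s'\n", opt);
}

/* Split a boot-style option string in place, then undo the damage so the
 * same buffer can be parsed again later. */
static void apply_options(char *buf)
{
    char *option;

    while ((option = strsep(&buf, ",")) != NULL) {
        if (*option)
            apply_option(option);

        /* strsep() wrote '\0' over the ','; put the comma back */
        if (buf)
            *(buf - 1) = ',';
    }
}

int main(void)
{
    char opts[] = "trace_printk,nooverwrite,,stacktrace";

    apply_options(opts);
    printf("buffer restored to: %s\n", opts);
    return 0;
}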
+ */ + if (orig_len > strlen(option)) + option[strlen(option)] = ' '; + return ret; } +static void __init apply_trace_boot_options(void) +{ + char *buf = trace_boot_options_buf; + char *option; + + while (true) { + option = strsep(&buf, ","); + + if (!option) + break; + + if (*option) + trace_set_options(&global_trace, option); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4297,11 +4340,8 @@ int tracing_update_buffers(void) struct trace_option_dentry; -static struct trace_option_dentry * -create_trace_option_files(struct trace_array *tr, struct tracer *tracer); - static void -destroy_trace_option_files(struct trace_option_dentry *topts); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); /* * Used to clear out the tracer before deletion of an instance. @@ -4320,20 +4360,13 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static void update_tracer_options(struct trace_array *tr, struct tracer *t) +static void add_tracer_options(struct trace_array *tr, struct tracer *t) { - static struct trace_option_dentry *topts; - /* Only enable if the directory has been created already. */ if (!tr->dir) return; - /* Currently, only the top instance has options */ - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - return; - - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); + create_trace_option_files(tr, t); } static int tracing_set_tracer(struct trace_array *tr, const char *buf) @@ -4402,7 +4435,6 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -4569,7 +4601,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -4626,11 +4658,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) static unsigned int trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { + struct trace_array *tr = iter->tr; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return POLLIN | POLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ @@ -5047,7 +5081,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? 
*/ - if (trace_flags & TRACE_ITER_STOP_ON_FREE) + if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -5080,7 +5114,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; - if (!(trace_flags & TRACE_ITER_MARKERS)) + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; if (cnt > TRACE_BUF_SIZE) @@ -6132,13 +6166,6 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) #include "trace_selftest.c" #endif -struct trace_option_dentry { - struct tracer_opt *opt; - struct tracer_flags *flags; - struct trace_array *tr; - struct dentry *entry; -}; - static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -6191,14 +6218,51 @@ static const struct file_operations trace_options_fops = { .llseek = generic_file_llseek, }; +/* + * In order to pass in both the trace_array descriptor as well as the index + * to the flag that the trace option file represents, the trace_array + * has a character array of trace_flags_index[], which holds the index + * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. + * The address of this character array is passed to the flag option file + * read/write callbacks. + * + * In order to extract both the index and the trace_array descriptor, + * get_tr_index() uses the following algorithm. + * + * idx = *ptr; + * + * As the pointer itself contains the address of the index (remember + * index[1] == 1). + * + * Then to get the trace_array descriptor, by subtracting that index + * from the ptr, we get to the start of the index itself. + * + * ptr - idx == &index[0] + * + * Then a simple container_of() from that pointer gets us to the + * trace_array descriptor. + */ +static void get_tr_index(void *data, struct trace_array **ptr, + unsigned int *pindex) +{ + *pindex = *(unsigned char *)data; + + *ptr = container_of(data - *pindex, struct trace_array, + trace_flags_index); +} + static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; char *buf; - if (trace_flags & (1 << index)) + get_tr_index(tr_index, &tr, &index); + + if (tr->trace_flags & (1 << index)) buf = "1\n"; else buf = "0\n"; @@ -6210,11 +6274,14 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = &global_trace; - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; unsigned long val; int ret; + get_tr_index(tr_index, &tr, &index); + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -6298,21 +6365,39 @@ create_trace_option_file(struct trace_array *tr, } -static struct trace_option_dentry * +static void create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; + struct trace_options *tr_topts; struct tracer_flags *flags; struct tracer_opt *opts; int cnt; + int i; if (!tracer) - return NULL; + return; flags = tracer->flags; if (!flags || !flags->opts) - return NULL; + return; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. 
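The trace_flags_index[] scheme documented above is worth a standalone illustration: because index[i] == i, a pointer to one element encodes both the bit number (its value) and, after subtracting that value, the address of index[0], from which container_of() recovers the owning structure. A user-space sketch of the same idea, with a local container_of() built from offsetof (struct and function names here are illustrative):

#include <stdio.h>
#include <stddef.h>

#define FLAGS_MAX 32

struct trace_array_like {
        unsigned int flags;
        unsigned char index[FLAGS_MAX];
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void get_owner_and_bit(void *data, struct trace_array_like **tr,
                              unsigned int *bit)
{
        *bit = *(unsigned char *)data;
        /* data - *bit == &index[0]; works because every bit number fits
         * in an unsigned char. Then step back to the struct start. */
        *tr = container_of((unsigned char *)data - *bit,
                           struct trace_array_like, index);
}

int main(void)
{
        struct trace_array_like ta = { .flags = 0 };
        struct trace_array_like *owner;
        unsigned int bit;
        int i;

        for (i = 0; i < FLAGS_MAX; i++)
                ta.index[i] = i;

        /* Pretend &ta.index[5] was stored as a file's private data. */
        get_owner_and_bit(&ta.index[5], &owner, &bit);
        owner->flags |= 1u << bit;

        printf("bit=%u flags=%#x same=%d\n", bit, owner->flags, owner == &ta);
        return 0;
}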
+ */ + if (!trace_ok_for_array(tracer, tr)) + return; + + for (i = 0; i < tr->nr_topts; i++) { + /* + * Check if these flags have already been added. + * Some tracers share flags. + */ + if (tr->topts[i].tracer->flags == tracer->flags) + return; + } opts = flags->opts; @@ -6321,27 +6406,27 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return NULL; - - for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(tr, &topts[cnt], flags, - &opts[cnt]); - - return topts; -} - -static void -destroy_trace_option_files(struct trace_option_dentry *topts) -{ - int cnt; + return; - if (!topts) + tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), + GFP_KERNEL); + if (!tr_topts) { + kfree(topts); return; + } - for (cnt = 0; topts[cnt].opt; cnt++) - tracefs_remove(topts[cnt].entry); + tr->topts = tr_topts; + tr->topts[tr->nr_topts].tracer = tracer; + tr->topts[tr->nr_topts].topts = topts; + tr->nr_topts++; - kfree(topts); + for (cnt = 0; opts[cnt].name; cnt++) { + create_trace_option_file(tr, &topts[cnt], flags, + &opts[cnt]); + WARN_ONCE(topts[cnt].entry == NULL, + "Failed to create trace option: %s", + opts[cnt].name); + } } static struct dentry * @@ -6354,21 +6439,26 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, (void *)index, - &trace_options_core_fops); + return trace_create_file(option, 0644, t_options, + (void *)&tr->trace_flags_index[index], + &trace_options_core_fops); } -static __init void create_trace_options_dir(struct trace_array *tr) +static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; + bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; - for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(tr, trace_options[i], i); + for (i = 0; trace_options[i]; i++) { + if (top_level || + !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + create_trace_option_core_file(tr, trace_options[i], i); + } } static ssize_t @@ -6435,7 +6525,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size { enum ring_buffer_flags rb_flags; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; buf->tr = tr; @@ -6505,6 +6595,30 @@ static void free_trace_buffers(struct trace_array *tr) #endif } +static void init_trace_flags_index(struct trace_array *tr) +{ + int i; + + /* Used by the trace options files */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) + tr->trace_flags_index[i] = i; +} + +static void __update_tracer_options(struct trace_array *tr) +{ + struct tracer *t; + + for (t = trace_types; t; t = t->next) + add_tracer_options(tr, t); +} + +static void update_tracer_options(struct trace_array *tr) +{ + mutex_lock(&trace_types_lock); + __update_tracer_options(tr); + mutex_unlock(&trace_types_lock); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -6530,6 +6644,8 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + tr->trace_flags = global_trace.trace_flags; + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&tr->start_lock); @@ -6555,6 +6671,8 @@ static int instance_mkdir(const char *name) } init_tracer_tracefs(tr, tr->dir); + init_trace_flags_index(tr); + __update_tracer_options(tr); list_add(&tr->list, &ftrace_trace_arrays); @@ -6580,6 +6698,7 @@ static int instance_rmdir(const char *name) struct trace_array *tr; int found = 0; int ret; + int i; mutex_lock(&trace_types_lock); @@ -6602,9 +6721,14 @@ static int instance_rmdir(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); + for (i = 0; i < tr->nr_topts; i++) { + kfree(tr->topts[i].topts); + } + kfree(tr->topts); + kfree(tr->name); kfree(tr); @@ -6666,6 +6790,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + create_trace_options_dir(tr); + #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); @@ -6861,11 +6987,7 @@ static __init int tracer_init_tracefs(void) create_trace_instances(d_tracer); - create_trace_options_dir(&global_trace); - - /* If the tracer was started via cmdline, create options for it here */ - if (global_trace.current_trace != &nop_trace) - update_tracer_options(&global_trace, global_trace.current_trace); + update_tracer_options(&global_trace); return 0; } @@ -6964,6 +7086,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static atomic_t dump_running; + struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; @@ -6990,13 +7113,13 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; switch (oops_dump_mode) { case DUMP_ALL: @@ -7059,7 +7182,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - trace_flags |= old_userobj; + tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { 
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); @@ -7074,6 +7197,12 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; + /* + * Make sure we don't accidently add more trace options + * than we have bits for. + */ + BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -7132,6 +7261,8 @@ __init static int tracer_alloc_buffers(void) ftrace_init_global_array_ops(&global_trace); + init_trace_flags_index(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ @@ -7148,12 +7279,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.events); list_add(&global_trace.list, &ftrace_trace_arrays); - while (trace_boot_options) { - char *option; - - option = strsep(&trace_boot_options, ","); - trace_set_options(&global_trace, option); - } + apply_trace_boot_options(); register_snapshot_cmd(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 74bde81601a9..dd7620802e72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -71,9 +71,6 @@ enum trace_type { tstruct \ } -#undef TP_ARGS -#define TP_ARGS(args...) args - #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) @@ -156,9 +153,12 @@ struct trace_array_cpu { pid_t pid; kuid_t uid; char comm[TASK_COMM_LEN]; + + bool ignore_pid; }; struct tracer; +struct trace_option_dentry; struct trace_buffer { struct trace_array *tr; @@ -168,6 +168,19 @@ struct trace_buffer { int cpu; }; +#define TRACE_FLAGS_MAX_SIZE 32 + +struct trace_options { + struct tracer *tracer; + struct trace_option_dentry *topts; +}; + +struct trace_pid_list { + unsigned int nr_pids; + int order; + pid_t *pids; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -193,6 +206,7 @@ struct trace_array { bool allocated_snapshot; unsigned long max_latency; #endif + struct trace_pid_list __rcu *filtered_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. 
The buffers themselves are @@ -216,13 +230,17 @@ struct trace_array { #endif int stop_count; int clock_id; + int nr_topts; struct tracer *current_trace; + unsigned int trace_flags; + unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; struct dentry *event_dir; + struct trace_options *topts; struct list_head systems; struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ @@ -333,6 +351,13 @@ struct tracer_flags { #define TRACER_OPT(s, b) .name = #s, .bit = b +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct trace_array *tr; + struct dentry *entry; +}; + /** * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file @@ -611,29 +636,12 @@ void update_max_tr_single(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc); - -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs); - void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc); void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_stack(struct ring_buffer *buffer, - unsigned long flags, int skip, int pc) -{ -} - -static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, - unsigned long flags, int skip, - int pc, struct pt_regs *regs) -{ -} - static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -707,8 +715,6 @@ int trace_array_printk_buf(struct ring_buffer *buffer, void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); -extern unsigned long trace_flags; - extern char trace_find_mark(unsigned long long duration); /* Standard output formatting function used for function return traces */ @@ -723,9 +729,14 @@ extern char trace_find_mark(unsigned long long duration); #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 #define TRACE_GRAPH_PRINT_TAIL 0x80 +#define TRACE_GRAPH_SLEEP_TIME 0x100 +#define TRACE_GRAPH_GRAPH_TIME 0x200 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) +extern void ftrace_graph_sleep_time_control(bool enable); +extern void ftrace_graph_graph_time_control(bool enable); + extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); @@ -859,7 +870,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops); #define ftrace_destroy_filter_files(ops) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ -int ftrace_event_is_function(struct trace_event_call *call); +bool ftrace_event_is_function(struct trace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces @@ -897,42 +908,94 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos); /* + * Only create function graph options if function graph is configured. 
+ */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FGRAPH_FLAGS \ + C(DISPLAY_GRAPH, "display-graph"), +#else +# define FGRAPH_FLAGS +#endif + +#ifdef CONFIG_BRANCH_TRACER +# define BRANCH_FLAGS \ + C(BRANCH, "branch"), +#else +# define BRANCH_FLAGS +#endif + +#ifdef CONFIG_FUNCTION_TRACER +# define FUNCTION_FLAGS \ + C(FUNCTION, "function-trace"), +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +#else +# define FUNCTION_FLAGS +# define FUNCTION_DEFAULT_FLAGS 0UL +#endif + +#ifdef CONFIG_STACKTRACE +# define STACK_FLAGS \ + C(STACKTRACE, "stacktrace"), +#else +# define STACK_FLAGS +#endif + +/* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * * NOTE: These bits must match the trace_options array in - * trace.c. + * trace.c (this macro guarantees it). + */ +#define TRACE_FLAGS \ + C(PRINT_PARENT, "print-parent"), \ + C(SYM_OFFSET, "sym-offset"), \ + C(SYM_ADDR, "sym-addr"), \ + C(VERBOSE, "verbose"), \ + C(RAW, "raw"), \ + C(HEX, "hex"), \ + C(BIN, "bin"), \ + C(BLOCK, "block"), \ + C(PRINTK, "trace_printk"), \ + C(ANNOTATE, "annotate"), \ + C(USERSTACKTRACE, "userstacktrace"), \ + C(SYM_USEROBJ, "sym-userobj"), \ + C(PRINTK_MSGONLY, "printk-msg-only"), \ + C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ + C(LATENCY_FMT, "latency-format"), \ + C(RECORD_CMD, "record-cmd"), \ + C(OVERWRITE, "overwrite"), \ + C(STOP_ON_FREE, "disable_on_free"), \ + C(IRQ_INFO, "irq-info"), \ + C(MARKERS, "markers"), \ + FUNCTION_FLAGS \ + FGRAPH_FLAGS \ + STACK_FLAGS \ + BRANCH_FLAGS + +/* + * By defining C, we can make TRACE_FLAGS a list of bit names + * that will define the bits for the flag masks. */ -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_PRINTK = 0x200, - TRACE_ITER_PREEMPTONLY = 0x400, - TRACE_ITER_BRANCH = 0x800, - TRACE_ITER_ANNOTATE = 0x1000, - TRACE_ITER_USERSTACKTRACE = 0x2000, - TRACE_ITER_SYM_USEROBJ = 0x4000, - TRACE_ITER_PRINTK_MSGONLY = 0x8000, - TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x20000, - TRACE_ITER_SLEEP_TIME = 0x40000, - TRACE_ITER_GRAPH_TIME = 0x80000, - TRACE_ITER_RECORD_CMD = 0x100000, - TRACE_ITER_OVERWRITE = 0x200000, - TRACE_ITER_STOP_ON_FREE = 0x400000, - TRACE_ITER_IRQ_INFO = 0x800000, - TRACE_ITER_MARKERS = 0x1000000, - TRACE_ITER_FUNCTION = 0x2000000, +#undef C +#define C(a, b) TRACE_ITER_##a##_BIT + +enum trace_iterator_bits { + TRACE_FLAGS + /* Make sure we don't go more than we have bits for */ + TRACE_ITER_LAST_BIT }; /* + * By redefining C, we can make TRACE_FLAGS a list of masks that + * use the bits as defined above. + */ +#undef C +#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) + +enum trace_iterator_flags { TRACE_FLAGS }; + +/* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. 
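The TRACE_FLAGS list above is an X-macro: the single C(a, b) list is expanded once with C defined to produce the *_BIT enumerators, once more to produce the (1 << bit) masks, and (per the comment, the companion expansion in trace.c) again to produce the trace_options[] name strings, so the three can never drift apart. A compact user-space sketch of the pattern, with made-up option names:

#include <stdio.h>

#define OPTION_FLAGS                        \
        C(PRINT_PARENT, "print-parent"),    \
        C(VERBOSE,      "verbose"),         \
        C(MARKERS,      "markers"),

/* Pass 1: bit positions. */
#undef C
#define C(a, b) OPT_##a##_BIT
enum option_bits { OPTION_FLAGS OPT_LAST_BIT };

/* Pass 2: masks built from the bit positions. */
#undef C
#define C(a, b) OPT_##a = (1 << OPT_##a##_BIT)
enum option_masks { OPTION_FLAGS };

/* Pass 3: the user-visible names, in the same order as the bits. */
#undef C
#define C(a, b) b
static const char *option_names[] = { OPTION_FLAGS NULL };

int main(void)
{
        int i;

        for (i = 0; option_names[i]; i++)
                printf("%-14s bit %d mask %#x\n", option_names[i], i, 1u << i);
        printf("OPT_VERBOSE = %#x\n", OPT_VERBOSE);
        return 0;
}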
*/ @@ -946,7 +1009,7 @@ extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { - if (trace_flags & TRACE_ITER_BRANCH) + if (tr->trace_flags & TRACE_ITER_BRANCH) return enable_branch_tracing(tr); return 0; } @@ -1269,6 +1332,7 @@ extern const char *__stop___trace_bprintk_fmt[]; extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; +void trace_printk_control(bool enabled); void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 40a14cbcf8e0..0f109c4130d3 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -43,7 +43,7 @@ static void trace_do_benchmark(void) unsigned int std = 0; /* Only run if the tracepoint is actually active */ - if (!trace_benchmark_event_enabled()) + if (!trace_benchmark_event_enabled() || !tracing_is_on()) return; local_irq_disable(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e2e12ad3186f..3a2a73716a5b 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -125,25 +125,14 @@ void disable_branch_tracing(void) mutex_unlock(&branch_tracing_mutex); } -static void start_branch_trace(struct trace_array *tr) -{ - enable_branch_tracing(tr); -} - -static void stop_branch_trace(struct trace_array *tr) -{ - disable_branch_tracing(); -} - static int branch_trace_init(struct trace_array *tr) { - start_branch_trace(tr); - return 0; + return enable_branch_tracing(tr); } static void branch_trace_reset(struct trace_array *tr) { - stop_branch_trace(tr); + disable_branch_tracing(); } static enum print_line_t trace_branch_print(struct trace_iterator *iter, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 404a372ad85a..6bbc5f652355 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,11 +15,15 @@ #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> +#include <linux/bsearch.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/sort.h> #include <linux/slab.h> #include <linux/delay.h> +#include <trace/events/sched.h> + #include <asm/setup.h> #include "trace_output.h" @@ -30,6 +34,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -37,21 +42,19 @@ static LIST_HEAD(ftrace_common_fields); static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; -#define SYSTEM_FL_FREE_NAME (1 << 31) - static inline int system_refcount(struct event_subsystem *system) { - return system->ref_count & ~SYSTEM_FL_FREE_NAME; + return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { - return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; + return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { - return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; + return --system->ref_count; } /* Double loops, do not use break, only goto's work */ @@ -94,6 +97,10 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + field = 
__find_event_field(&ftrace_common_fields, name); if (field) return field; @@ -144,6 +151,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, } EXPORT_SYMBOL_GPL(trace_define_field); +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type); \ + if (ret) \ + return ret; + #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ @@ -153,6 +167,16 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, cpu, FILTER_OTHER); + __generic_field(char *, comm, FILTER_PTR_STRING); + + return ret; +} + static int trace_define_common_fields(void) { int ret; @@ -190,12 +214,32 @@ int trace_event_raw_init(struct trace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) +{ + struct trace_array *tr = trace_file->tr; + struct trace_array_cpu *data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + if (!pid_list) + return false; + + data = this_cpu_ptr(tr->trace_buffer.data); + + return data->ignore_pid; +} +EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); + void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) { struct trace_event_call *event_call = trace_file->event_call; + if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(trace_file)) + return NULL; + local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); fbuffer->trace_file = trace_file; @@ -316,6 +360,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { struct trace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; int ret = 0; int disable; @@ -379,7 +424,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); - if (trace_flags & TRACE_ITER_RECORD_CMD) { + if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } @@ -424,6 +469,142 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } +static int cmp_pid(const void *key, const void *elt) +{ + const pid_t *search_pid = key; + const pid_t *pid = elt; + + if (*search_pid == *pid) + return 0; + if (*search_pid < *pid) + return -1; + return 1; +} + +static bool +check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + pid_t search_pid; + pid_t *pid; + + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. 
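check_ignore_pid() above depends on the pid array being kept sorted (the set_event_pid write path sorts and deduplicates it), so membership can be tested from the sched_switch probes with a plain bsearch(). A small user-space sketch of the same lookup using the libc bsearch() and an equivalent comparator (the helper name is illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

static int cmp_pid(const void *key, const void *elt)
{
        const pid_t *search_pid = key;
        const pid_t *pid = elt;

        if (*search_pid == *pid)
                return 0;
        return *search_pid < *pid ? -1 : 1;
}

/* Return 1 when the task should be ignored (pid not in the filter list). */
static int ignore_pid(const pid_t *pids, size_t nr_pids, pid_t task_pid)
{
        if (!nr_pids)           /* no filter installed: trace everything */
                return 0;

        return bsearch(&task_pid, pids, nr_pids, sizeof(pid_t), cmp_pid) == NULL;
}

int main(void)
{
        pid_t filtered[] = { 12, 47, 103, 2041 };       /* already sorted */

        printf("%d\n", ignore_pid(filtered, 4, 47));    /* 0: traced  */
        printf("%d\n", ignore_pid(filtered, 4, 48));    /* 1: ignored */
        return 0;
}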
+ */ + if (!filtered_pids) + return false; + + search_pid = task->pid; + + pid = bsearch(&search_pid, filtered_pids->pids, + filtered_pids->nr_pids, sizeof(pid_t), + cmp_pid); + if (!pid) + return true; + + return false; +} + +static void +event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, prev) && + check_ignore_pid(pid_list, next)); +} + +static void +event_filter_pid_sched_switch_probe_post(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, next)); +} + +static void +event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are already tracing */ + if (!this_cpu_read(tr->trace_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, task)); +} + +static void +event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are not tracing */ + if (this_cpu_read(tr->trace_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + /* Set tracing if current is enabled */ + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, current)); +} + +static void __ftrace_clear_event_pids(struct trace_array *tr) +{ + struct trace_pid_list *pid_list; + struct trace_event_file *file; + int cpu; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + if (!pid_list) + return; + + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); + + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + + list_for_each_entry(file, &tr->events, list) { + clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false; + + rcu_assign_pointer(tr->filtered_pids, NULL); + + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); + + free_pages((unsigned long)pid_list->pids, pid_list->order); + kfree(pid_list); +} + +static void ftrace_clear_event_pids(struct trace_array *tr) +{ + mutex_lock(&event_mutex); + __ftrace_clear_event_pids(tr); + mutex_unlock(&event_mutex); +} + static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; @@ -438,8 +619,7 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } - if (system->ref_count & SYSTEM_FL_FREE_NAME) - kfree(system->name); + kfree_const(system->name); kfree(system); } @@ -757,6 +937,58 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void *p_start(struct seq_file *m, loff_t 
*pos) + __acquires(RCU) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + /* + * Grab the mutex, to keep calls to p_next() having the same + * tr->filtered_pids as p_start() has. + * If we just passed the tr->filtered_pids around, then RCU would + * have been enough, but doing that makes things more complex. + */ + mutex_lock(&event_mutex); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->filtered_pids); + + if (!pid_list || *pos >= pid_list->nr_pids) + return NULL; + + return (void *)&pid_list->pids[*pos]; +} + +static void p_stop(struct seq_file *m, void *p) + __releases(RCU) +{ + rcu_read_unlock_sched(); + mutex_unlock(&event_mutex); +} + +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + + (*pos)++; + + if (*pos >= pid_list->nr_pids) + return NULL; + + return (void *)&pid_list->pids[*pos]; +} + +static int p_show(struct seq_file *m, void *v) +{ + pid_t *pid = v; + + seq_printf(m, "%d\n", *pid); + return 0; +} + static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1314,8 +1546,209 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static int max_pids(struct trace_pid_list *pid_list) +{ + return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); +} + +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->filtered_pids, + mutex_is_locked(&event_mutex)); + + this_cpu_write(tr->trace_buffer.data->ignore_pid, + check_ignore_pid(pid_list, current)); +} + +static ssize_t +ftrace_event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list = NULL; + struct trace_event_file *file; + struct trace_parser parser; + unsigned long val; + loff_t this_pos; + ssize_t read = 0; + ssize_t ret = 0; + pid_t pid; + int i; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) + return -ENOMEM; + + mutex_lock(&event_mutex); + /* + * Load as many pids into the array before doing a + * swap from the tr->filtered_pids to the new list. 
+ */ + while (cnt > 0) { + + this_pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &this_pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val > INT_MAX) + break; + + pid = (pid_t)val; + + ret = -ENOMEM; + if (!pid_list) { + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + break; + + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + if (filtered_pids) + pid_list->order = filtered_pids->order; + else + pid_list->order = 0; + + pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, + pid_list->order); + if (!pid_list->pids) + break; + + if (filtered_pids) { + pid_list->nr_pids = filtered_pids->nr_pids; + memcpy(pid_list->pids, filtered_pids->pids, + pid_list->nr_pids * sizeof(pid_t)); + } else + pid_list->nr_pids = 0; + } + + if (pid_list->nr_pids >= max_pids(pid_list)) { + pid_t *pid_page; + + pid_page = (void *)__get_free_pages(GFP_KERNEL, + pid_list->order + 1); + if (!pid_page) + break; + memcpy(pid_page, pid_list->pids, + pid_list->nr_pids * sizeof(pid_t)); + free_pages((unsigned long)pid_list->pids, pid_list->order); + + pid_list->order++; + pid_list->pids = pid_page; + } + + pid_list->pids[pid_list->nr_pids++] = pid; + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + if (pid_list) + free_pages((unsigned long)pid_list->pids, pid_list->order); + kfree(pid_list); + mutex_unlock(&event_mutex); + return ret; + } + + if (!pid_list) { + mutex_unlock(&event_mutex); + return ret; + } + + sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); + + /* Remove duplicates */ + for (i = 1; i < pid_list->nr_pids; i++) { + int start = i; + + while (i < pid_list->nr_pids && + pid_list->pids[i - 1] == pid_list->pids[i]) + i++; + + if (start != i) { + if (i < pid_list->nr_pids) { + memmove(&pid_list->pids[start], &pid_list->pids[i], + (pid_list->nr_pids - i) * sizeof(pid_t)); + pid_list->nr_pids -= i - start; + i = start; + } else + pid_list->nr_pids = start; + } + } + + rcu_assign_pointer(tr->filtered_pids, pid_list); + + list_for_each_entry(file, &tr->events, list) { + set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + if (filtered_pids) { + synchronize_sched(); + + free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); + kfree(filtered_pids); + } else { + /* + * Register a probe that is called before all other probes + * to set ignore_pid if next or prev do not match. + * Register a probe this is called after all other probes + * to only keep ignore_pid set if next pid matches. + */ + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + } + + /* + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. 
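The write path above grows the pid array in whole pages: capacity is (PAGE_SIZE << order) / sizeof(pid_t), and when it fills up a new allocation one order larger (twice the size) is made, the old contents copied across, and the old pages freed, before the list is sorted, deduplicated and published with rcu_assign_pointer(). A user-space sketch of that grow-by-doubling-order bookkeeping, with malloc standing in for __get_free_pages and PAGE_SIZE assumed to be 4096:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#define PAGE_SIZE 4096

struct pid_list {
        unsigned int nr_pids;
        int order;
        pid_t *pids;
};

static unsigned int max_pids(const struct pid_list *pl)
{
        return (PAGE_SIZE << pl->order) / sizeof(pid_t);
}

static int add_pid(struct pid_list *pl, pid_t pid)
{
        if (pl->nr_pids >= max_pids(pl)) {
                /* Double the allocation: one more power-of-two of pages. */
                pid_t *bigger = malloc((size_t)PAGE_SIZE << (pl->order + 1));

                if (!bigger)
                        return -1;
                memcpy(bigger, pl->pids, pl->nr_pids * sizeof(pid_t));
                free(pl->pids);
                pl->order++;
                pl->pids = bigger;
        }
        pl->pids[pl->nr_pids++] = pid;
        return 0;
}

int main(void)
{
        struct pid_list pl = { 0, 0, malloc(PAGE_SIZE) };
        unsigned int i;

        for (i = 0; i < 3000; i++)      /* forces the order to double twice */
                add_pid(&pl, (pid_t)i);

        printf("nr_pids=%u order=%d capacity=%u\n",
               pl.nr_pids, pl.order, max_pids(&pl));
        free(pl.pids);
        return 0;
}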
+ */ + on_each_cpu(ignore_task_cpu, tr, 1); + + mutex_unlock(&event_mutex); + + ret = read; + *ppos += read; + + return ret; +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { @@ -1332,6 +1765,13 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; +static const struct seq_operations show_set_pid_seq_ops = { + .start = p_start, + .next = p_next, + .show = p_show, + .stop = p_stop, +}; + static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, @@ -1347,6 +1787,14 @@ static const struct file_operations ftrace_set_event_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_set_event_pid_fops = { + .open = ftrace_event_set_pid_open, + .read = seq_read, + .write = ftrace_event_pid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, @@ -1457,6 +1905,26 @@ ftrace_event_set_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_event_set_pid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + static struct event_subsystem * create_new_subsystem(const char *name) { @@ -1470,13 +1938,9 @@ create_new_subsystem(const char *name) system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ - if (!core_kernel_data((unsigned long)name)) { - system->ref_count |= SYSTEM_FL_FREE_NAME; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) - goto out_free; - } else - system->name = name; + system->name = kstrdup_const(name, GFP_KERNEL); + if (!system->name) + goto out_free; system->filter = NULL; @@ -1489,8 +1953,7 @@ create_new_subsystem(const char *name) return system; out_free: - if (system->ref_count & SYSTEM_FL_FREE_NAME) - kfree(system->name); + kfree_const(system->name); kfree(system); return NULL; } @@ -2456,6 +2919,9 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = tracefs_create_file("set_event_pid", 0644, parent, + tr, &ftrace_set_event_pid_fops); + /* ring buffer internal formats */ trace_create_file("header_page", 0444, d_events, ring_buffer_print_page_header, @@ -2536,6 +3002,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); + /* Clear the pid list */ + __ftrace_clear_event_pids(tr); + /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); @@ -2573,16 +3042,16 @@ early_enable_events(struct trace_array *tr, bool disable_first) if (!token) break; - if (!*token) - continue; - /* Restarting syscalls requires that we stop them first */ - if (disable_first) - ftrace_set_clr_event(tr, token, 0); + if (*token) { + /* Restarting syscalls 
requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } /* Put back the comma to allow this to be called again */ if (buf) @@ -2671,6 +3140,9 @@ static __init int event_trace_init(void) if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); @@ -2866,7 +3338,9 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static void +static struct trace_array *event_tr; + +static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { @@ -2897,7 +3371,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); @@ -2913,6 +3387,9 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; + event_tr = top_trace_array(); + if (WARN_ON(!event_tr)) + return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d81d6f302b14..f93a219b18da 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* Filter predicate for CPUs. */ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + int match = 0; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + match = cpu == cmp; + break; + case OP_LT: + match = cpu < cmp; + break; + case OP_LE: + match = cpu <= cmp; + break; + case OP_GT: + match = cpu > cmp; + break; + case OP_GE: + match = cpu >= cmp; + break; + default: + break; + } + + return !!match == !pred->not; +} + +/* Filter predicate for COMM. 
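filter_pred_cpu() above compares the current CPU against the filter constant with the requested operator and then folds in the predicate's negation with !!match == !pred->not, so "!="-style filters reuse the same comparison code. A user-space sketch of that predicate shape (the operator names match the diff; the rest is illustrative):

#include <stdio.h>

enum filter_op { OP_EQ, OP_LT, OP_LE, OP_GT, OP_GE };

struct pred {
        enum filter_op op;
        int val;
        int not;        /* set for negated filters such as "cpu != 3" */
};

static int pred_match(const struct pred *p, int cpu)
{
        int match = 0;

        switch (p->op) {
        case OP_EQ: match = cpu == p->val; break;
        case OP_LT: match = cpu <  p->val; break;
        case OP_LE: match = cpu <= p->val; break;
        case OP_GT: match = cpu >  p->val; break;
        case OP_GE: match = cpu >= p->val; break;
        }

        /* Normalise to 0/1 and apply the negation in one step. */
        return !!match == !p->not;
}

int main(void)
{
        struct pred eq3 = { OP_EQ, 3, 0 };      /* cpu == 3 */
        struct pred ne3 = { OP_EQ, 3, 1 };      /* cpu != 3 */

        printf("%d %d\n", pred_match(&eq3, 3), pred_match(&eq3, 5));  /* 1 0 */
        printf("%d %d\n", pred_match(&ne3, 3), pred_match(&ne3, 5));  /* 0 1 */
        return 0;
}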
*/ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp, match; + + cmp = pred->regex.match(current->comm, &pred->regex, + pred->regex.field_len); + match = cmp ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; @@ -929,15 +973,15 @@ static bool is_string_field(struct ftrace_event_field *field) field->filter_type == FILTER_PTR_STRING; } -static int is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) - return 0; + return false; if (!is_string_field(field) && op == OP_GLOB) - return 0; + return false; - return 1; + return true; } static filter_pred_fn_t select_comparison_fn(int op, int field_size, @@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps, if (is_string_field(field)) { filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) { + if (!strcmp(field->name, "comm")) { + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - fn = select_comparison_fn(pred->op, field->size, + if (!strcmp(field->name, "cpu")) + fn = filter_pred_cpu; + else + fn = select_comparison_fn(pred->op, field->size, field->is_signed); if (!fn) { parse_error(ps, FILT_ERR_INVALID_OP, 0); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index adabf7da9113..39aa7aa66468 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), filter, NULL) -int ftrace_event_is_function(struct trace_event_call *call) +bool ftrace_event_is_function(struct trace_event_call *call) { return call == &event_function; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..92382af7a213 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -83,13 +83,18 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, /* Display function name after trailing } */ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, + /* Include sleep time (scheduled out) between entry and return */ + { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, + /* Include time within nested functions */ + { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | + TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, .opts = trace_opts }; @@ -107,8 +112,8 @@ enum { }; static void -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags); +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ int @@ -653,6 +658,7 @@ static void print_graph_irq(struct 
trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -660,7 +666,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { /* Absolute time */ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); @@ -676,19 +682,19 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) print_graph_lat_fmt(s, ent); } /* No overhead */ - print_graph_duration(0, s, flags | FLAGS_FILL_START); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_START); if (type == TRACE_GRAPH_ENT) trace_seq_puts(s, "==========>"); else trace_seq_puts(s, "<=========="); - print_graph_duration(0, s, flags | FLAGS_FILL_END); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_END); trace_seq_putc(s, '\n'); } @@ -715,22 +721,22 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } static void -print_graph_duration(unsigned long long duration, struct trace_seq *s, - u32 flags) +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags) { if (!(flags & TRACE_GRAPH_PRINT_DURATION) || - !(trace_flags & TRACE_ITER_CONTEXT_INFO)) + !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; /* No real adata, just filling the column with spaces */ @@ -764,6 +770,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; @@ -792,7 +799,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead and duration */ - print_graph_duration(duration, s, flags); + print_graph_duration(tr, duration, s, flags); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) @@ -810,6 +817,7 @@ print_graph_entry_nested(struct trace_iterator *iter, { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; int i; if (data) { @@ -825,7 +833,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) @@ -849,6 +857,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; + struct trace_array *tr = iter->tr; int cpu = iter->cpu; /* Pid */ @@ -858,7 +867,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, /* Interrupt */ print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; /* Absolute time */ @@ -876,7 +885,7 
@@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) print_graph_lat_fmt(s, ent); return; @@ -1027,6 +1036,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; @@ -1058,7 +1068,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, print_graph_prologue(iter, s, 0, 0, flags); /* Overhead and duration */ - print_graph_duration(duration, s, flags); + print_graph_duration(tr, duration, s, flags); /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) @@ -1091,7 +1101,8 @@ static enum print_line_t print_graph_comment(struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; struct trace_event *event; int depth = 0; @@ -1104,7 +1115,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, print_graph_prologue(iter, s, 0, 0, flags); /* No time */ - print_graph_duration(0, s, flags | FLAGS_FILL_FULL); + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); /* Indentation */ if (depth > 0) @@ -1245,9 +1256,10 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s||| / \n", size, spaces); } -static void __print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct trace_array *tr, + struct seq_file *s, u32 flags) { - int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; if (lat) print_lat_header(s, flags); @@ -1289,11 +1301,12 @@ static void print_graph_headers(struct seq_file *s) void print_graph_headers_flags(struct seq_file *s, u32 flags) { struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; - if (trace_flags & TRACE_ITER_LATENCY_FMT) { + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { /* print nothing if the buffers are empty */ if (trace_empty(iter)) return; @@ -1301,7 +1314,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) print_trace_header(s, iter); } - __print_graph_headers_flags(s, flags); + __print_graph_headers_flags(tr, s, flags); } void graph_trace_open(struct trace_iterator *iter) @@ -1362,6 +1375,12 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; + if (bit == TRACE_GRAPH_SLEEP_TIME) + ftrace_graph_sleep_time_control(set); + + if (bit == TRACE_GRAPH_GRAPH_TIME) + ftrace_graph_graph_time_control(set); + return 0; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8523ea345f2b..e4e56589ec1d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -31,7 +31,6 @@ enum { static int trace_type __read_mostly; static int save_flags; -static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -57,22 +56,16 @@ 
irq_trace(void) # define irq_trace() (0) #endif -#define TRACE_DISPLAY_GRAPH 1 - -static struct tracer_opt trace_opts[] = { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +static int irqsoff_display_graph(struct trace_array *tr, int set); +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +#else +static inline int irqsoff_display_graph(struct trace_array *tr, int set) +{ + return -EINVAL; +} +# define is_graph(tr) false #endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; - -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) /* * Sequence count - we record it when starting a measurement and @@ -152,15 +145,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int -irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +static int irqsoff_display_graph(struct trace_array *tr, int set) { int cpu; - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - - if (!(is_graph() ^ set)) + if (!(is_graph(tr) ^ set)) return 0; stop_irqsoff_tracer(irqsoff_trace, !set); @@ -209,7 +198,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) static void irqsoff_trace_open(struct trace_iterator *iter) { - if (is_graph()) + if (is_graph(iter->tr)) graph_trace_open(iter); } @@ -231,7 +220,7 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ - if (is_graph()) + if (is_graph(iter->tr)) return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; @@ -239,7 +228,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) + struct trace_array *tr = irqsoff_trace; + + if (is_graph(tr)) print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); else trace_default_header(s); @@ -250,7 +241,7 @@ __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (is_graph()) + if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, flags, pc); else trace_function(tr, ip, parent_ip, flags, pc); @@ -259,27 +250,23 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int -irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - +#ifdef CONFIG_FUNCTION_TRACER static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) { return -1; } +#endif static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } static void irqsoff_print_header(struct seq_file *s) { trace_default_header(s); @@ -295,16 +282,16 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? 
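The irqsoff rework above follows a common kernel convention: the real irqsoff_display_graph() and is_graph() only exist under CONFIG_FUNCTION_GRAPH_TRACER, while the #else branch supplies an inline stub returning -EINVAL and a constant-false macro, so callers never need ifdefs of their own. A compact user-space sketch of that stub pattern (CONFIG_FEATURE_X and the function names are made up for illustration):

#include <stdio.h>
#include <errno.h>

/* #define CONFIG_FEATURE_X 1 */        /* flip this to build the real code */

#ifdef CONFIG_FEATURE_X
static int feature_x_display(int set)
{
        printf("switching display mode to %d\n", set);
        return 0;
}
# define feature_x_enabled()    1
#else
/* Stubs: callers can use these unconditionally. */
static inline int feature_x_display(int set)
{
        (void)set;
        return -EINVAL;
}
# define feature_x_enabled()    0
#endif

int main(void)
{
        if (feature_x_enabled())
                feature_x_display(1);
        else
                printf("feature not built in: %d\n", feature_x_display(1));
        return 0;
}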
*/ -static int report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) - return 0; + return false; } else { if (delta <= tr->max_latency) - return 0; + return false; } - return 1; + return true; } static void @@ -523,12 +510,15 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_FUNCTION_TRACER +static bool function_enabled; + static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) return 0; if (graph) @@ -556,20 +546,40 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph) function_enabled = false; } -static void irqsoff_function_set(struct trace_array *tr, int set) +static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) { + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + if (set) - register_irqsoff_function(tr, is_graph(), 1); + register_irqsoff_function(tr, is_graph(tr), 1); else - unregister_irqsoff_function(tr, is_graph()); + unregister_irqsoff_function(tr, is_graph(tr)); + return 1; +} +#else +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { } +static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; } +#endif /* CONFIG_FUNCTION_TRACER */ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { struct tracer *tracer = tr->current_trace; - if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(tr, set); + if (irqsoff_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return irqsoff_display_graph(tr, set); +#endif return trace_keep_overwrite(tracer, mask, set); } @@ -602,7 +612,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) if (irqsoff_busy) return -EBUSY; - save_flags = trace_flags; + save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); @@ -618,7 +628,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) /* Only toplevel instance supports graph tracing */ if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && - is_graph()))) + is_graph(tr)))) printk(KERN_ERR "failed to start irqsoff tracer\n"); irqsoff_busy = true; @@ -630,7 +640,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr) int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; - stop_irqsoff_tracer(tr, is_graph()); + stop_irqsoff_tracer(tr, is_graph(tr)); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); @@ -666,8 +676,6 @@ static struct tracer irqsoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -700,8 +708,6 @@ static struct tracer preemptoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = 
irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -736,8 +742,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, - .flags = &tracer_flags, - .set_flag = irqsoff_set_flag, .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3ccf5c2c1320..57149bce6aad 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -21,20 +21,22 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; + tr = iter.tr; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - old_userobj = trace_flags; + old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); @@ -82,7 +84,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); out: - trace_flags = old_userobj; + tr->trace_flags = old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. 
*/ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 638e110c5bfd..2be8c4f2403d 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -314,7 +314,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry->rw = *rw; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -344,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry->map = *map; if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); + trace_buffer_unlock_commit(tr, buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..282982195e09 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -322,8 +322,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, # define IP_FMT "%016lx" #endif -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) +static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) { struct file *file = NULL; unsigned long vmstart = 0; @@ -355,50 +355,6 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, } int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - struct mm_struct *mm = NULL; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! 
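The trace_kprobe.c hunk above replaces the open-coded set_fs()/__copy_from_user_inatomic() loop with a single strncpy_from_unsafe() call, and the returned length is fed straight into make_data_rloc(). A rough userspace analogue of the copy semantics relied on here (bounded copy, forced NUL termination, byte count returned); the fault handling of the in-kernel helper cannot be reproduced in this sketch and the names are illustrative:

#include <stdio.h>
#include <string.h>

/*
 * Copy at most maxlen bytes, stop after the terminating NUL, always
 * NUL-terminate dst, and return the number of bytes consumed (including
 * the NUL).  The in-kernel helper additionally disables page faults and
 * reports -EFAULT on a faulting access.
 */
static long bounded_strcpy(char *dst, const char *src, long maxlen)
{
	long n = 0;

	if (maxlen <= 0)
		return 0;

	do {
		dst[n] = src[n];
		n++;
	} while (dst[n - 1] != '\0' && n < maxlen);

	dst[n - 1] = '\0';	/* force termination on truncation */
	return n;
}

int main(void)
{
	char buf[8];
	long len = bounded_strcpy(buf, "tracing", sizeof(buf));

	printf("copied %ld bytes: \"%s\"\n", len, buf);
	return 0;
}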
- */ - rcu_read_lock(); - task = find_task_by_vpid(entry->tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) - break; - - trace_seq_puts(s, " => "); - - if (!ip) { - trace_seq_puts(s, "??"); - trace_seq_putc(s, '\n'); - continue; - } - - seq_print_user_ip(s, mm, ip, sym_flags); - trace_seq_putc(s, '\n'); - } - - if (mm) - mmput(mm); - - return !trace_seq_has_overflowed(s); -} - -int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { if (!ip) { @@ -496,6 +452,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +466,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } @@ -518,7 +476,8 @@ char trace_find_mark(unsigned long long d) static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { - unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + struct trace_array *tr = iter->tr; + unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; @@ -561,6 +520,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) int trace_print_context(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; unsigned long long t; @@ -572,7 +532,7 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); - if (trace_flags & TRACE_ITER_IRQ_INFO) + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { @@ -588,14 +548,15 @@ int trace_print_context(struct trace_iterator *iter) int trace_print_lat_context(struct trace_iterator *iter) { - u64 next_ts; + struct trace_array *tr = iter->tr; /* trace_find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; + u64 next_ts; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, &next_ts); - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); /* Restore the original ent_size */ iter->ent_size = ent_size; @@ -1077,13 +1038,49 @@ static struct trace_event trace_stack_event = { static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, int flags, struct trace_event *event) { + struct trace_array *tr = iter->tr; struct userstack_entry *field; struct trace_seq *s = &iter->seq; + struct mm_struct *mm = NULL; + unsigned int i; trace_assign_type(field, iter->ent); trace_seq_puts(s, "<user stack trace>\n"); - seq_print_userip_objs(field, s, flags); + + if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! 
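The trace_output.c changes above add '@' (100 ms) and '*' (10 ms) rows to the delay-annotation table and make the lookup strictly greater-than, so a delta exactly equal to a threshold now takes the next smaller mark. A self-contained sketch of that table walk; the ' ' fallback for deltas below every threshold is an assumption of this sketch:

#include <stdio.h>

struct trace_mark {
	unsigned long long	val;	/* threshold in nanoseconds */
	char			sym;	/* annotation character */
};

/* Same thresholds as the hunk above, largest first. */
static const struct trace_mark mark[] = {
	{ 1000000000ULL, '$' },	/* 1 sec */
	{  100000000ULL, '@' },	/* 100 msec */
	{   10000000ULL, '*' },	/* 10 msec */
	{    1000000ULL, '#' },	/* 1000 usecs */
	{     100000ULL, '!' },	/* 100 usecs */
	{      10000ULL, '+' },	/* 10 usecs */
};

/* Walk the table until the delta is strictly greater than a threshold. */
static char find_mark(unsigned long long d)
{
	size_t i;

	for (i = 0; i < sizeof(mark) / sizeof(mark[0]); i++) {
		if (d > mark[i].val)
			return mark[i].sym;
	}
	return ' ';
}

int main(void)
{
	printf("150 msec -> '%c'\n", find_mark(150000000ULL));	/* '@' */
	printf(" 42 usec -> '%c'\n", find_mark(42000ULL));	/* '+' */
	printf("  3 usec -> '%c'\n", find_mark(3000ULL));	/* ' ' */
	return 0;
}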
+ */ + rcu_read_lock(); + task = find_task_by_vpid(field->tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = field->caller[i]; + + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + + if (!ip) { + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); + continue; + } + + seq_print_user_ip(s, mm, ip, flags); + trace_seq_putc(s, '\n'); + } + + if (mm) + mmput(mm); return trace_handle_return(s); } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 4cbfe85b99c8..fabc49bcd493 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -14,10 +14,6 @@ trace_print_printk_msg_only(struct trace_iterator *iter); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern int seq_print_userip_objs(const struct userstack_entry *entry, - struct trace_seq *s, unsigned long sym_flags); -extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 36c1455b7567..1c2b28536feb 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -178,6 +178,12 @@ static inline void format_mod_start(void) { } static inline void format_mod_stop(void) { } #endif /* CONFIG_MODULES */ +static bool __read_mostly trace_printk_enabled = true; + +void trace_printk_control(bool enabled) +{ + trace_printk_enabled = enabled; +} __initdata_or_module static struct notifier_block module_trace_bprintk_format_nb = { @@ -192,7 +198,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) if (unlikely(!fmt)) return 0; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; va_start(ap, fmt); @@ -207,7 +213,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) if (unlikely(!fmt)) return 0; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; return trace_vbprintk(ip, fmt, ap); @@ -219,7 +225,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) 
int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; va_start(ap, fmt); @@ -231,7 +237,7 @@ EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!trace_printk_enabled) return 0; return trace_vprintk(ip, fmt, ap); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b98dee914542..f6398db09114 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -302,15 +302,15 @@ static nokprobe_inline void call_fetch(struct fetch_param *fprm, } /* Check the name is good for event/group/fields */ -static inline int is_good_name(const char *name) +static inline bool is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') - return 0; + return false; while (*++name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return 0; + return false; } - return 1; + return true; } static inline struct event_file_link * diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 419ca37e72c9..4c896a0101bd 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,7 +16,8 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static void -probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next) { if (unlikely(!sched_ref)) return; @@ -26,7 +27,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n } static void -probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee) { if (unlikely(!sched_ref)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9b33dd117f3f..9d4399b553a3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -34,31 +34,28 @@ static arch_spinlock_t wakeup_lock = static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); -static int wakeup_graph_entry(struct ftrace_graph_ent *trace); -static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; -static bool function_enabled; - -#define TRACE_DISPLAY_GRAPH 1 -static struct tracer_opt trace_opts[] = { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* display latency trace as call graph */ - { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +static int wakeup_display_graph(struct trace_array *tr, int set); +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +#else +static inline int wakeup_display_graph(struct trace_array *tr, int set) +{ + return 0; +} +# define is_graph(tr) false #endif - { } /* Empty entry */ -}; - -static struct tracer_flags tracer_flags = { - .val = 0, - .opts = trace_opts, -}; -#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) #ifdef CONFIG_FUNCTION_TRACER +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); + +static bool function_enabled; + /* * Prologue for the wakeup function tracers. 
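The trace_printk.c hunks above stop consulting the global trace_flags word on every __trace_printk()/__ftrace_vprintk() call and instead test a cached boolean that trace_printk_control() flips. A minimal userspace sketch of that fast-path gate, with illustrative names:

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

/* One boolean checked on the hot path, toggled by a control function. */
static bool printk_enabled = true;

static void printk_control(bool enabled)
{
	printk_enabled = enabled;
}

static int my_trace_printk(const char *fmt, ...)
{
	va_list ap;
	int ret;

	if (!printk_enabled)
		return 0;

	va_start(ap, fmt);
	ret = vprintf(fmt, ap);
	va_end(ap);
	return ret;
}

int main(void)
{
	my_trace_printk("visible %d\n", 1);
	printk_control(false);
	my_trace_printk("suppressed %d\n", 2);	/* prints nothing */
	return 0;
}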
* @@ -128,14 +125,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } -#endif /* CONFIG_FUNCTION_TRACER */ static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ - if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) return 0; if (graph) @@ -163,20 +159,40 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph) function_enabled = false; } -static void wakeup_function_set(struct trace_array *tr, int set) +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) { + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + if (set) - register_wakeup_function(tr, is_graph(), 1); + register_wakeup_function(tr, is_graph(tr), 1); else - unregister_wakeup_function(tr, is_graph()); + unregister_wakeup_function(tr, is_graph(tr)); + return 1; +} +#else +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_wakeup_function(struct trace_array *tr, int graph) { } +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; } +#endif /* CONFIG_FUNCTION_TRACER */ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { struct tracer *tracer = tr->current_trace; - if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(tr, set); + if (wakeup_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return wakeup_display_graph(tr, set); +#endif return trace_keep_overwrite(tracer, mask, set); } @@ -203,14 +219,9 @@ static void stop_func_tracer(struct trace_array *tr, int graph) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int -wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +static int wakeup_display_graph(struct trace_array *tr, int set) { - - if (!(bit & TRACE_DISPLAY_GRAPH)) - return -EINVAL; - - if (!(is_graph() ^ set)) + if (!(is_graph(tr) ^ set)) return 0; stop_func_tracer(tr, !set); @@ -259,7 +270,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) static void wakeup_trace_open(struct trace_iterator *iter) { - if (is_graph()) + if (is_graph(iter->tr)) graph_trace_open(iter); } @@ -279,7 +290,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ - if (is_graph()) + if (is_graph(iter->tr)) return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; @@ -287,7 +298,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) static void wakeup_print_header(struct seq_file *s) { - if (is_graph()) + if (is_graph(wakeup_trace)) print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); else trace_default_header(s); @@ -298,7 +309,7 @@ __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (is_graph()) + if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, flags, pc); else trace_function(tr, ip, parent_ip, flags, pc); @@ -306,27 +317,20 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int -wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) -{ - return -EINVAL; -} - -static int wakeup_graph_entry(struct 
ftrace_graph_ent *trace) -{ - return -1; -} - static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } -static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } #ifdef CONFIG_FUNCTION_TRACER +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } static void wakeup_print_header(struct seq_file *s) { trace_default_header(s); @@ -342,16 +346,16 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(struct trace_array *tr, cycle_t delta) +static bool report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) - return 0; + return false; } else { if (delta <= tr->max_latency) - return 0; + return false; } - return 1; + return true; } static void @@ -388,7 +392,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, flags, pc); } static void @@ -416,11 +420,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(tr, buffer, event, flags, pc); } static void notrace -probe_wakeup_sched_switch(void *ignore, +probe_wakeup_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; @@ -514,7 +518,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(void *ignore, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p) { struct trace_array_cpu *data; int cpu = smp_processor_id(); @@ -635,7 +639,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(tr, is_graph())) + if (start_func_tracer(tr, is_graph(tr))) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -648,7 +652,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(tr, is_graph()); + stop_func_tracer(tr, is_graph(tr)); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -659,7 +663,7 @@ static bool wakeup_busy; static int __wakeup_tracer_init(struct trace_array *tr) { - save_flags = trace_flags; + save_flags = tr->trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); @@ -740,8 +744,6 @@ static struct tracer wakeup_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -762,8 +764,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef 
CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, @@ -784,8 +784,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, - .flags = &tracer_flags, - .set_flag = wakeup_set_flag, .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..dda9e6742950 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -16,30 +16,22 @@ #include "trace.h" -#define STACK_TRACE_ENTRIES 500 - -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; -static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +unsigned stack_trace_index[STACK_TRACE_ENTRIES]; /* * Reserve one entry for the passed in ip. This will allow * us to remove most or all of the stack size overhead * added by the stack tracer itself. */ -static struct stack_trace max_stack_trace = { +struct stack_trace stack_trace_max = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; -static unsigned long max_stack_size; -static arch_spinlock_t max_stack_lock = +unsigned long stack_trace_max_size; +arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static DEFINE_PER_CPU(int, trace_active); @@ -48,83 +40,95 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void print_max_stack(void) +void stack_trace_print(void) { long i; int size; pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + stack_trace_max.nr_entries); - for (i = 0; i < max_stack_trace.nr_entries; i++) { + for (i = 0; i < stack_trace_max.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (i+1 == max_stack_trace.nr_entries || + if (i+1 == stack_trace_max.nr_entries || stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; + size = stack_trace_index[i]; else - size = stack_dump_index[i] - stack_dump_index[i+1]; + size = stack_trace_index[i] - stack_trace_index[i+1]; - pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_trace_index[i], size, (void *)stack_dump_trace[i]); } } -static inline void +/* + * When arch-specific code overides this function, the following + * data should be filled up, assuming stack_trace_max_lock is held to + * prevent concurrent updates. + * stack_trace_index[] + * stack_trace_max + * stack_trace_max_size + */ +void __weak check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; /* Remove the frame of the tracer */ this_size -= frame_size; - if (this_size <= max_stack_size) + if (this_size <= stack_trace_max_size) return; /* we do not handle interrupt stacks yet */ if (!object_is_on_stack(stack)) return; + /* Can't do this from NMI context (can cause deadlocks) */ + if (in_nmi()) + return; + local_irq_save(flags); - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); + + /* + * RCU may not be watching, make it see us. 
+ * The stack trace code uses rcu_sched. + */ + rcu_irq_enter(); /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; /* a race could have already updated it */ - if (this_size <= max_stack_size) + if (this_size <= stack_trace_max_size) goto out; - max_stack_size = this_size; + stack_trace_max_size = this_size; - max_stack_trace.nr_entries = 0; + stack_trace_max.nr_entries = 0; + stack_trace_max.skip = 3; - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + save_stack_trace(&stack_trace_max); - save_stack_trace(&max_stack_trace); - - /* - * Add the passed in ip from the function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < stack_trace_max.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -136,15 +140,18 @@ check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < max_stack_trace.nr_entries) { + while (i < stack_trace_max.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_trace_index[x] = this_size; p = start; - for (; p < top && i < max_stack_trace.nr_entries; p++) { + for (; p < top && i < stack_trace_max.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,10 +163,10 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. */ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); - max_stack_size -= tracer_frame; + stack_trace_max_size -= tracer_frame; } } } @@ -168,13 +175,18 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + stack_trace_max.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { - print_max_stack(); + stack_trace_print(); BUG(); } out: - arch_spin_unlock(&max_stack_lock); + rcu_irq_exit(); + arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } @@ -192,24 +204,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. - * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. 
- */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -262,9 +257,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, cpu = smp_processor_id(); per_cpu(trace_active, cpu)++; - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); *ptr = val; - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); per_cpu(trace_active, cpu)--; local_irq_restore(flags); @@ -284,7 +279,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -307,7 +302,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) cpu = smp_processor_id(); per_cpu(trace_active, cpu)++; - arch_spin_lock(&max_stack_lock); + arch_spin_lock(&stack_trace_max_lock); if (*pos == 0) return SEQ_START_TOKEN; @@ -319,7 +314,7 @@ static void t_stop(struct seq_file *m, void *p) { int cpu; - arch_spin_unlock(&max_stack_lock); + arch_spin_unlock(&stack_trace_max_lock); cpu = smp_processor_id(); per_cpu(trace_active, cpu)--; @@ -354,9 +349,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + stack_trace_max.nr_entries); - if (!stack_tracer_enabled && !max_stack_size) + if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); return 0; @@ -364,17 +359,17 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= max_stack_trace.nr_entries || + if (i >= stack_trace_max.nr_entries || stack_dump_trace[i] == ULONG_MAX) return 0; - if (i+1 == max_stack_trace.nr_entries || + if (i+1 == stack_trace_max.nr_entries || stack_dump_trace[i+1] == ULONG_MAX) - size = stack_dump_index[i]; + size = stack_trace_index[i]; else - size = stack_dump_index[i] - stack_dump_index[i+1]; + size = stack_trace_index[i] - stack_trace_index[i+1]; - seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + seq_printf(m, "%3ld) %8d %5d ", i, stack_trace_index[i], size); trace_lookup_stack(m, i); @@ -464,7 +459,7 @@ static __init int stack_trace_init(void) return 0; trace_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); + &stack_trace_max_size, &stack_max_size_fops); trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7d567a4b9fa7..0655afbea83f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -110,6 +110,7 @@ static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; @@ -136,7 +137,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) + if (tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, 
trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3490407dc7b7..ecd536de603a 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -91,11 +91,13 @@ static void debug_print_probes(struct tracepoint_func *funcs) printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); } -static struct tracepoint_func *func_add(struct tracepoint_func **funcs, - struct tracepoint_func *tp_func) +static struct tracepoint_func * +func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, + int prio) { - int nr_probes = 0; struct tracepoint_func *old, *new; + int nr_probes = 0; + int pos = -1; if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); @@ -104,18 +106,33 @@ static struct tracepoint_func *func_add(struct tracepoint_func **funcs, old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + /* Insert before probes of lower priority */ + if (pos < 0 && old[nr_probes].prio < prio) + pos = nr_probes; if (old[nr_probes].func == tp_func->func && old[nr_probes].data == tp_func->data) return ERR_PTR(-EEXIST); + } } /* + 2 : one for new probe, one for NULL func */ new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); - if (old) - memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes] = *tp_func; + if (old) { + if (pos < 0) { + pos = nr_probes; + memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); + } else { + /* Copy higher priority probes ahead of the new probe */ + memcpy(new, old, pos * sizeof(struct tracepoint_func)); + /* Copy the rest after it. */ + memcpy(new + pos + 1, old + pos, + (nr_probes - pos) * sizeof(struct tracepoint_func)); + } + } else + pos = 0; + new[pos] = *tp_func; new[nr_probes + 1].func = NULL; *funcs = new; debug_print_probes(*funcs); @@ -174,7 +191,7 @@ static void *func_remove(struct tracepoint_func **funcs, * Add the probe function to a tracepoint. 
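The tracepoint.c hunk above teaches func_add() to keep the probe array ordered by priority: it scans for the first existing entry with a lower priority and copies the old entries around the insertion point. A standalone sketch of that ordering rule, assuming an illustrative struct layout rather than the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct probe {
	void (*func)(void);	/* NULL marks the end of the array */
	int prio;
};

static struct probe *probe_add(const struct probe *old, struct probe add)
{
	int nr = 0, pos = -1;
	struct probe *new;

	if (old) {
		for (nr = 0; old[nr].func; nr++) {
			/* insert before probes of lower priority */
			if (pos < 0 && old[nr].prio < add.prio)
				pos = nr;
		}
	}
	if (pos < 0)
		pos = nr;

	/* + 2: one slot for the new probe, one for the NULL terminator */
	new = calloc(nr + 2, sizeof(*new));
	if (!new)
		return NULL;

	if (old) {
		memcpy(new, old, pos * sizeof(*new));	/* higher priority first */
		memcpy(new + pos + 1, old + pos, (nr - pos) * sizeof(*new));
	}
	new[pos] = add;		/* new[nr + 1].func stays NULL via calloc() */
	return new;
}

static void a(void) { }
static void b(void) { }
static void c(void) { }

int main(void)
{
	struct probe start[] = { { a, 10 }, { b, 0 }, { NULL, 0 } };
	struct probe *probes = probe_add(start, (struct probe){ c, 5 });
	int i;

	for (i = 0; probes && probes[i].func; i++)
		printf("slot %d: prio %d\n", i, probes[i].prio);	/* 10, 5, 0 */
	free(probes);
	return 0;
}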
*/ static int tracepoint_add_func(struct tracepoint *tp, - struct tracepoint_func *func) + struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; @@ -183,7 +200,7 @@ static int tracepoint_add_func(struct tracepoint *tp, tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); - old = func_add(&tp_funcs, func); + old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { WARN_ON_ONCE(1); return PTR_ERR(old); @@ -240,6 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * @tp: tracepoint * @probe: probe handler * @data: tracepoint data + * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -247,7 +265,8 @@ static int tracepoint_remove_func(struct tracepoint *tp, * performed either with a tracepoint module going notifier, or from * within module exit functions. */ -int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) +int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, + void *data, int prio) { struct tracepoint_func tp_func; int ret; @@ -255,10 +274,30 @@ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; - ret = tracepoint_add_func(tp, &tp_func); + tp_func.prio = prio; + ret = tracepoint_add_func(tp, &tp_func, prio); mutex_unlock(&tracepoints_mutex); return ret; } +EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @tp: tracepoint + * @probe: probe handler + * @data: tracepoint data + * @prio: priority of this function over other registered functions + * + * Returns 0 if ok, error value on error. + * Note: if @tp is within a module, the caller is responsible for + * unregistering the probe before the module is gone. This can be + * performed either with a tracepoint module going notifier, or from + * within module exit functions. 
+ */ +int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) +{ + return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); +} EXPORT_SYMBOL_GPL(tracepoint_probe_register); /** diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4109f8320684..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); @@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (user_ns == current_user_ns()) return -EINVAL; - /* Threaded processes may not enter a different user namespace */ - if (atomic_read(¤t->mm->mm_users) > 1) + /* Tasks that share a thread group must share a user namespace */ + if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..18f34cf75f74 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> #include <linux/perf_event.h> +#include <linux/kthread.h> /* * The run state of the lockup detectors is controlled by the content of the @@ -56,8 +57,10 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #else #define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -66,7 +69,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. + */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). 
+ */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -90,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn; * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic = +unsigned int __read_mostly hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -153,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif /* @@ -243,15 +273,15 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static int is_hardlockup(void) +static bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return 1; + return true; __this_cpu_write(hrtimer_interrupts_saved, hrint); - return 0; + return false; } #endif @@ -259,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); - if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { + if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. */ if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; @@ -298,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); + struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) return; - if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", - this_cpu); + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; @@ -327,6 +370,9 @@ static void watchdog_interrupt_count(void) static int watchdog_nmi_enable(unsigned int cpu); static void watchdog_nmi_disable(unsigned int cpu); +static int watchdog_enable_all_cpus(void); +static void watchdog_disable_all_cpus(void); + /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -613,46 +659,9 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void 
watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { @@ -666,46 +675,107 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static void restart_watchdog_hrtimer(void *info) +/* + * park all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function returns an error if kthread_park() of a watchdog thread + * fails. In this situation, the watchdog threads of some CPUs can already + * be parked and the watchdog threads of other CPUs can still be runnable. + * Callers are expected to handle this special condition as appropriate in + * their context. + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). + */ +static int watchdog_park_threads(void) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); - int ret; + int cpu, ret = 0; + for_each_watchdog_cpu(cpu) { + ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (ret) + break; + } + + return ret; +} + +/* + * unpark all watchdog threads that are specified in 'watchdog_cpumask' + * + * This function may only be called in a context that is protected against + * races with CPU hotplug - for example, via get_online_cpus(). + */ +static void watchdog_unpark_threads(void) +{ + int cpu; + + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); +} + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); /* - * No need to cancel and restart hrtimer if it is currently executing - * because it will reprogram itself with the new period now. - * We should never see it unqueued here because we are running per-cpu - * with interrupts disabled. + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). */ - ret = hrtimer_try_to_cancel(hrtimer); - if (ret == 1) - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; } -static void update_watchdog(int cpu) +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) { + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; /* - * Make sure that perf event counter will adopt to a new - * sampling period. 
Updating the sampling period directly would - * be much nicer but we do not have an API for that now so - * let's use a big hammer. - * Hrtimer will adopt the new period on the next tick but this - * might be late already so we have to restart the timer as well. + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. */ - watchdog_nmi_disable(cpu); - smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); - watchdog_nmi_enable(cpu); + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); } -static void update_watchdog_all_cpus(void) +static int update_watchdog_all_cpus(void) { - int cpu; + int ret; - get_online_cpus(); - for_each_watchdog_cpu(cpu) - update_watchdog(cpu); - put_online_cpus(); + ret = watchdog_park_threads(); + if (ret) + return ret; + + watchdog_unpark_threads(); + + return 0; } static int watchdog_enable_all_cpus(void) @@ -713,29 +783,31 @@ static int watchdog_enable_all_cpus(void) int err = 0; if (!watchdog_running) { - err = smpboot_register_percpu_thread(&watchdog_threads); + err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_cpumask); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else { - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask)) - pr_err("Failed to set cpumask for watchdog threads\n"); + else watchdog_running = 1; - } } else { /* * Enable/disable the lockup detectors or * change the sample period 'on the fly'. */ - update_watchdog_all_cpus(); + err = update_watchdog_all_cpus(); + + if (err) { + watchdog_disable_all_cpus(); + pr_err("Failed to update lockup detectors, disabled\n"); + } } + if (err) + watchdog_enabled = 0; + return err; } -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { if (watchdog_running) { @@ -744,6 +816,8 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL + /* * Update the run state of the lockup detectors. */ @@ -785,8 +859,15 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, int err, old, new; int *watchdog_param = (int *)table->data; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -820,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, } while (cmpxchg(&watchdog_enabled, old, new) != old); /* - * Update the run state of the lockup detectors. - * Restore 'watchdog_enabled' on failure. + * Update the run state of the lockup detectors. There is _no_ + * need to check the value returned by proc_watchdog_update() + * and to restore the previous value of 'watchdog_enabled' as + * both lockup detectors are disabled if proc_watchdog_update() + * returns an error. 
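The lockup_detector_suspend()/lockup_detector_resume() pair above counts nested suspend requests, parks the watchdog threads on the first suspend and unparks them only when the last resume drops the count back to zero. A userspace sketch of that reference-counted bookkeeping, where park()/unpark() merely stand in for kthread_park()/kthread_unpark():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int suspended;
static bool running = true;

static void park(void)   { printf("parking watchdog threads\n"); }
static void unpark(void) { printf("unparking watchdog threads\n"); }

static void detector_suspend(void)
{
	pthread_mutex_lock(&lock);
	if (running && suspended == 0)
		park();			/* only the first request parks */
	suspended++;
	pthread_mutex_unlock(&lock);
}

static void detector_resume(void)
{
	pthread_mutex_lock(&lock);
	suspended--;
	if (running && suspended == 0)
		unpark();		/* only the last resume unparks */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	detector_suspend();	/* parks */
	detector_suspend();	/* nested: no-op */
	detector_resume();	/* still suspended */
	detector_resume();	/* unparks */
	return 0;
}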
*/ err = proc_watchdog_update(); - if (err) - watchdog_enabled = old; } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -870,8 +953,15 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, { int err, old; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -879,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, goto out; /* - * Update the sample period. - * Restore 'watchdog_thresh' on failure. + * Update the sample period. Restore on failure. */ set_sample_period(); err = proc_watchdog_update(); - if (err) + if (err) { watchdog_thresh = old; + set_sample_period(); + } out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -902,7 +994,15 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; + get_online_cpus(); mutex_lock(&watchdog_proc_mutex); + + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -920,7 +1020,9 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, pr_err("cpumask update failed\n"); } } +out: mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); return err; } @@ -932,10 +1034,8 @@ void __init lockup_detector_init(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { - if (!cpumask_empty(tick_nohz_full_mask)) - pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, - tick_nohz_full_mask); + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_copy(&watchdog_cpumask, housekeeping_mask); } else cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c4f06176f74..c579dbab2e36 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex), \ + "sched RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex) || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex) && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 
timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; + /* timer isn't guaranteed to run in this cpu, record earlier */ + if (cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - if (unlikely(cpu != WORK_CPU_UNBOUND)) - add_timer_on(timer, cpu); - else - add_timer(timer); + add_timer_on(timer, cpu); } /** @@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) goto fail; set_user_nice(worker->task, pool->attrs->nice); - - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(worker->task, pool->attrs->cpumask); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -2614,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -3201,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; int node; + int target_node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3212,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(attrs->cpumask, + wq_numa_possible_cpumask[node])) { + target_node = node; + break; + } + } + } + /* nope, create a new one */ - pool = kzalloc(sizeof(*pool), GFP_KERNEL); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); if (!pool || init_worker_pool(pool) < 0) goto fail; lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + pool->node = target_node; /* * no_numa isn't a worker_pool attribute, always clear it. See @@ -3226,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) */ pool->attrs->no_numa = false; - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(pool->attrs->cpumask, - wq_numa_possible_cpumask[node])) { - pool->node = node; - break; - } - } - } - if (worker_pool_assign_id(pool) < 0) goto fail; @@ -3856,7 +3856,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_NO_SETAFFINITY; + kthread_bind_mask(rescuer->task, cpu_possible_mask); wake_up_process(rescuer->task); } |
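The assert_rcu_or_*() conversions in the workqueue.c hunks above invert the assertion polarity: rcu_lockdep_assert() took the condition that must hold, while RCU_LOCKDEP_WARN() takes the condition that signals a problem, so callers negate each term and combine with && instead of ||. A small standalone illustration of the same polarity flip, with WARN_IF() standing in for the lockdep macro:

#include <stdio.h>

/* Warn-style check: complain when the problem condition is true. */
#define WARN_IF(cond, msg) \
	do { if (cond) fprintf(stderr, "warning: %s\n", msg); } while (0)

static int read_lock_held;
static int mutex_held;

static void check_locking(void)
{
	/* old assert style: read_lock_held || mutex_held must hold */
	WARN_IF(!read_lock_held && !mutex_held,
		"sched RCU or wq_pool_mutex should be held");
}

int main(void)
{
	check_locking();	/* warns: nothing held */
	mutex_held = 1;
	check_locking();	/* silent */
	return 0;
}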