From c4a8849af939082052d8117f9ea3e170a99ff232 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:42 +0200 Subject: sched: Remove obsolete arch_ prefixes Non weak static functions clearly are not arch specific, so remove the arch_ prefix. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.820460566@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 48013633d792..d3e183c85f49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serializes calls to init_sched_domains, * detach_destroy_domains and partition_sched_domains. */ static DEFINE_MUTEX(sched_domains_mutex); @@ -7670,7 +7670,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const struct cpumask *cpu_map) +static int init_sched_domains(const struct cpumask *cpu_map) { int err; @@ -7687,7 +7687,7 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) return err; } -static void arch_destroy_sched_domains(const struct cpumask *cpu_map, +static void destroy_sched_domains(const struct cpumask *cpu_map, struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); @@ -7706,7 +7706,7 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7815,7 +7815,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void arch_reinit_sched_domains(void) +static void reinit_sched_domains(void) { get_online_cpus(); @@ -7848,7 +7848,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - arch_reinit_sched_domains(); + reinit_sched_domains(); return count; } @@ -7974,7 +7974,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_active_mask); + init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); -- cgit v1.2.3 From d274cb30f4a08045492d3f0c47cdf1a25668b1f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:43 +0200 Subject: sched: Simplify ->cpu_power initialization The code in update_group_power() does what init_sched_groups_power() does and more, so remove the special init_ code and call the generic code instead. Also move the sd->span_weight initialization because update_group_power() needs it. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.875856012@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 44 +++++--------------------------------------- 1 file changed, 5 insertions(+), 39 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d3e183c85f49..50d5fd33e8d5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6679,9 +6679,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - for (tmp = sd; tmp; tmp = tmp->parent) - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -7159,11 +7156,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - struct sched_domain *child; - struct sched_group *group; - long power; - int weight; - WARN_ON(!sd || !sd->groups); if (cpu != group_first_cpu(sd->groups)) @@ -7171,36 +7163,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); - child = sd->child; - - sd->groups->cpu_power = 0; - - if (!child) { - power = SCHED_LOAD_SCALE; - weight = cpumask_weight(sched_domain_span(sd)); - /* - * SMT siblings share the power of a single core. - * Usually multiple threads get a better yield out of - * that one core than a single thread would have, - * reflect that in sd->smt_gain. - */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= sd->smt_gain; - power /= weight; - power >>= SCHED_LOAD_SHIFT; - } - sd->groups->cpu_power += power; - return; - } - - /* - * Add cpu_power of each child group to this groups cpu_power. - */ - group = child->groups; - do { - sd->groups->cpu_power += group->cpu_power; - group = group->next; - } while (group != child->groups); + update_group_power(sd, cpu); } /* @@ -7507,7 +7470,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, { enum s_alloc alloc_state = sa_none; struct s_data d; - struct sched_domain *sd; + struct sched_domain *sd, *tmp; int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; @@ -7530,6 +7493,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + + for (tmp = sd; tmp; tmp = tmp->parent) + tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); } for_each_cpu(i, cpu_map) { -- cgit v1.2.3 From a06dadbec5c5df0bf3a35f33616f67d10ca9ba28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:44 +0200 Subject: sched: Simplify build_sched_groups() Notice that the mask being computed is the same as the domain span we just computed. By using the domain_span we can avoid some mask allocations and computations. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.925028189@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 50d5fd33e8d5..e3818f1b98fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6866,9 +6866,6 @@ struct s_data { cpumask_var_t notcovered; #endif cpumask_var_t nodemask; - cpumask_var_t this_sibling_map; - cpumask_var_t this_core_map; - cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6880,9 +6877,6 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, - sa_this_book_map, - sa_this_core_map, - sa_this_sibling_map, sa_nodemask, sa_sched_group_nodes, #ifdef CONFIG_NUMA @@ -7251,12 +7245,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ - case sa_this_book_map: - free_cpumask_var(d->this_book_map); /* fall through */ - case sa_this_core_map: - free_cpumask_var(d->this_core_map); /* fall through */ - case sa_this_sibling_map: - free_cpumask_var(d->this_sibling_map); /* fall through */ case sa_nodemask: free_cpumask_var(d->nodemask); /* fall through */ case sa_sched_group_nodes: @@ -7295,14 +7283,8 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, #endif if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) return sa_sched_group_nodes; - if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) - return sa_nodemask; - if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) - return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) - return sa_this_core_map; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_this_book_map; + return sa_nodemask; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -7414,39 +7396,40 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, static void build_sched_groups(struct s_data *d, enum sched_domain_level l, const struct cpumask *cpu_map, int cpu) { + struct sched_domain *sd; + switch (l) { #ifdef CONFIG_SCHED_SMT case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - cpumask_and(d->this_sibling_map, cpu_map, - topology_thread_cpumask(cpu)); - if (cpu == cpumask_first(d->this_sibling_map)) - init_sched_build_groups(d->this_sibling_map, cpu_map, + sd = &per_cpu(cpu_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_cpu_group, d->send_covered, d->tmpmask); break; #endif #ifdef CONFIG_SCHED_MC case SD_LV_MC: /* set up multi-core groups */ - cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); - if (cpu == cpumask_first(d->this_core_map)) - init_sched_build_groups(d->this_core_map, cpu_map, + sd = &per_cpu(core_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_core_group, d->send_covered, d->tmpmask); break; #endif #ifdef CONFIG_SCHED_BOOK case SD_LV_BOOK: /* set up book groups */ - cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); - if (cpu == cpumask_first(d->this_book_map)) - init_sched_build_groups(d->this_book_map, cpu_map, + sd = &per_cpu(book_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_book_group, d->send_covered, d->tmpmask); break; #endif case SD_LV_CPU: /* set up physical groups */ - cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); - if (!cpumask_empty(d->nodemask)) - init_sched_build_groups(d->nodemask, cpu_map, + sd = &per_cpu(phys_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_phys_group, d->send_covered, d->tmpmask); break; @@ -7502,11 +7485,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); - } - - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) build_sched_groups(&d, SD_LV_CPU, cpu_map, i); + } #ifdef CONFIG_NUMA /* Set up node groups */ -- cgit v1.2.3 From cd4ea6ae3982f6861da3b510e69cbc194f331d83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:45 +0200 Subject: sched: Change NODE sched_domain group creation The NODE sched_domain is 'special' in that it allocates sched_groups per CPU, instead of sharing the sched_groups between all CPUs. While this might have some benefits on large NUMA and avoid remote memory accesses when iterating the sched_groups, this does break current code that assumes sched_groups are shared between all sched_domains (since the dynamic cpu_power patches). So refactor the NODE groups to behave like all other groups. (The ALLNODES domain again shared its groups across the CPUs for some reason). If someone does measure a performance decrease due to this change we need to revisit this and come up with another way to have both dynamic cpu_power and NUMA work nice together. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.978111700@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 229 ++++++++------------------------------------------------- 1 file changed, 32 insertions(+), 197 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e3818f1b98fe..72d561fa67b7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6861,29 +6861,18 @@ struct static_sched_domain { struct s_data { #ifdef CONFIG_NUMA int sd_allnodes; - cpumask_var_t domainspan; - cpumask_var_t covered; - cpumask_var_t notcovered; #endif cpumask_var_t nodemask; cpumask_var_t send_covered; cpumask_var_t tmpmask; - struct sched_group **sched_group_nodes; struct root_domain *rd; }; enum s_alloc { - sa_sched_groups = 0, sa_rootdomain, sa_tmpmask, sa_send_covered, sa_nodemask, - sa_sched_group_nodes, -#ifdef CONFIG_NUMA - sa_notcovered, - sa_covered, - sa_domainspan, -#endif sa_none, }; @@ -6979,18 +6968,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, } #ifdef CONFIG_NUMA -/* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. - */ static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_node); -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, +static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *nodemask) { @@ -7000,142 +6981,27 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; + *sg = &per_cpu(sched_group_node, group).sg; return group; } -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu(j, sched_group_cpus(sg)) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j).sd; - if (j != group_first_cpu(sd->groups)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - } while (sg != group_head); -} +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int build_numa_sched_groups(struct s_data *d, - const struct cpumask *cpu_map, int num) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { - struct sched_domain *sd; - struct sched_group *sg, *prev; - int n, j; - - cpumask_clear(d->covered); - cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); - if (cpumask_empty(d->nodemask)) { - d->sched_group_nodes[num] = NULL; - goto out; - } - - sched_domain_node_span(num, d->domainspan); - cpumask_and(d->domainspan, d->domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); - return -ENOMEM; - } - d->sched_group_nodes[num] = sg; - - for_each_cpu(j, d->nodemask) { - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } + int group; - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->nodemask); - sg->next = sg; - cpumask_or(d->covered, d->covered, d->nodemask); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); + group = cpumask_first(nodemask); - prev = sg; - for (j = 0; j < nr_node_ids; j++) { - n = (num + j) % nr_node_ids; - cpumask_complement(d->notcovered, d->covered); - cpumask_and(d->tmpmask, d->notcovered, cpu_map); - cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); - if (cpumask_empty(d->tmpmask)) - break; - cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d->tmpmask)) - continue; - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - return -ENOMEM; - } - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->tmpmask); - sg->next = prev->next; - cpumask_or(d->covered, d->covered, d->tmpmask); - prev->next = sg; - prev = sg; - } -out: - return 0; + if (sg) + *sg = &per_cpu(sched_group_allnodes, group).sg; + return group; } -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ - int cpu, i; - for_each_cpu(cpu, cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (!sched_group_nodes) - continue; - - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -} -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ -} #endif /* CONFIG_NUMA */ /* @@ -7236,9 +7102,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { switch (what) { - case sa_sched_groups: - free_sched_groups(cpu_map, d->tmpmask); /* fall through */ - d->sched_group_nodes = NULL; case sa_rootdomain: free_rootdomain(d->rd); /* fall through */ case sa_tmpmask: @@ -7247,16 +7110,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->send_covered); /* fall through */ case sa_nodemask: free_cpumask_var(d->nodemask); /* fall through */ - case sa_sched_group_nodes: -#ifdef CONFIG_NUMA - kfree(d->sched_group_nodes); /* fall through */ - case sa_notcovered: - free_cpumask_var(d->notcovered); /* fall through */ - case sa_covered: - free_cpumask_var(d->covered); /* fall through */ - case sa_domainspan: - free_cpumask_var(d->domainspan); /* fall through */ -#endif case sa_none: break; } @@ -7265,24 +7118,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { -#ifdef CONFIG_NUMA - if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) - return sa_none; - if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) - return sa_domainspan; - if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) - return sa_covered; - /* Allocate the per-node list of sched groups */ - d->sched_group_nodes = kcalloc(nr_node_ids, - sizeof(struct sched_group *), GFP_KERNEL); - if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return sa_notcovered; - } - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; -#endif if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_sched_group_nodes; + return sa_none; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) return sa_nodemask; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) @@ -7322,6 +7159,7 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, if (parent) parent->child = sd; cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); + cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7434,6 +7272,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #ifdef CONFIG_NUMA + case SD_LV_NODE: + sd = &per_cpu(node_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, + &cpu_to_node_group, + d->send_covered, d->tmpmask); + case SD_LV_ALLNODES: init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, d->send_covered, d->tmpmask); @@ -7462,7 +7307,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - alloc_state = sa_sched_groups; /* * Set up domains for cpus specified by the cpu_map. @@ -7486,16 +7330,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); build_sched_groups(&d, SD_LV_CPU, cpu_map, i); + build_sched_groups(&d, SD_LV_NODE, cpu_map, i); } #ifdef CONFIG_NUMA /* Set up node groups */ if (d.sd_allnodes) build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - - for (i = 0; i < nr_node_ids; i++) - if (build_numa_sched_groups(&d, cpu_map, i)) - goto error; #endif /* Calculate CPU power for physical packages and nodes */ @@ -7524,15 +7365,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(d.sched_group_nodes[i]); + for_each_cpu(i, cpu_map) { + sd = &per_cpu(node_domains, i).sd; + init_sched_groups_power(i, sd); + } if (d.sd_allnodes) { - struct sched_group *sg; - - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - d.tmpmask); - init_numa_sched_groups_power(sg); + for_each_cpu(i, cpu_map) { + sd = &per_cpu(allnodes_domains, i).sd; + init_sched_groups_power(i, sd); + } } #endif @@ -7550,7 +7392,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_attach_domain(sd, d.rd, i); } - d.sched_group_nodes = NULL; /* don't free this we still need it */ __free_domain_allocs(&d, sa_tmpmask, cpu_map); return 0; @@ -7636,7 +7477,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) static void destroy_sched_domains(const struct cpumask *cpu_map, struct cpumask *tmpmask) { - free_sched_groups(cpu_map, tmpmask); } /* @@ -7913,11 +7753,6 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif get_online_cpus(); mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); -- cgit v1.2.3 From 3739494e08da50c8a68d65eed5ba3012a54b40d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:46 +0200 Subject: sched: Clean up some ALLNODES code Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.025636011@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 72d561fa67b7..fa10cf73c80c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7280,7 +7280,9 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); case SD_LV_ALLNODES: - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + if (cpu == cpumask_first(cpu_map)) + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, d->send_covered, d->tmpmask); break; #endif @@ -7331,14 +7333,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, build_sched_groups(&d, SD_LV_MC, cpu_map, i); build_sched_groups(&d, SD_LV_CPU, cpu_map, i); build_sched_groups(&d, SD_LV_NODE, cpu_map, i); + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, i); } -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (d.sd_allnodes) - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); -#endif - /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { -- cgit v1.2.3 From 1cf51902546d60b8a7a6aba2dd557bd4ba8840ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:47 +0200 Subject: sched: Simplify sched_group creation Instead of calling build_sched_groups() for each possible sched_domain we might have created, note that we can simply iterate the sched_domain tree and call it for each sched_domain present. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.077862519@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fa10cf73c80c..e66d24aaf6d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7231,15 +7231,12 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } -static void build_sched_groups(struct s_data *d, enum sched_domain_level l, +static void build_sched_groups(struct s_data *d, struct sched_domain *sd, const struct cpumask *cpu_map, int cpu) { - struct sched_domain *sd; - - switch (l) { + switch (sd->level) { #ifdef CONFIG_SCHED_SMT case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - sd = &per_cpu(cpu_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_cpu_group, @@ -7248,7 +7245,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, #endif #ifdef CONFIG_SCHED_MC case SD_LV_MC: /* set up multi-core groups */ - sd = &per_cpu(core_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_core_group, @@ -7257,7 +7253,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, #endif #ifdef CONFIG_SCHED_BOOK case SD_LV_BOOK: /* set up book groups */ - sd = &per_cpu(book_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_book_group, @@ -7265,7 +7260,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, break; #endif case SD_LV_CPU: /* set up physical groups */ - sd = &per_cpu(phys_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_phys_group, @@ -7273,7 +7267,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, break; #ifdef CONFIG_NUMA case SD_LV_NODE: - sd = &per_cpu(node_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_node_group, @@ -7323,17 +7316,10 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); - for (tmp = sd; tmp; tmp = tmp->parent) + for (tmp = sd; tmp; tmp = tmp->parent) { tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - } - - for_each_cpu(i, cpu_map) { - build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); - build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); - build_sched_groups(&d, SD_LV_MC, cpu_map, i); - build_sched_groups(&d, SD_LV_CPU, cpu_map, i); - build_sched_groups(&d, SD_LV_NODE, cpu_map, i); - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, i); + build_sched_groups(&d, tmp, cpu_map, i); + } } /* Calculate CPU power for physical packages and nodes */ -- cgit v1.2.3 From 21d42ccfd6c6c11f96c2acfd32a85cfc33514d3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:48 +0200 Subject: sched: Simplify finding the lowest sched_domain Instead of relying on knowing the build order and various CONFIG_ flags simply remember the bottom most sched_domain when we created the domain hierarchy. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.134511046@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e66d24aaf6d1..d6992bfa11eb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6865,11 +6865,13 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t send_covered; cpumask_var_t tmpmask; + struct sched_domain ** __percpu sd; struct root_domain *rd; }; enum s_alloc { sa_rootdomain, + sa_sd, sa_tmpmask, sa_send_covered, sa_nodemask, @@ -7104,6 +7106,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, switch (what) { case sa_rootdomain: free_rootdomain(d->rd); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ case sa_tmpmask: free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: @@ -7124,10 +7128,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) { + printk(KERN_WARNING "Cannot alloc per-cpu pointers\n"); + return sa_tmpmask; + } d->rd = alloc_rootdomain(); if (!d->rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); - return sa_tmpmask; + return sa_sd; } return sa_rootdomain; } @@ -7316,6 +7325,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + *per_cpu_ptr(d.sd, i) = sd; + for (tmp = sd; tmp; tmp = tmp->parent) { tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); build_sched_groups(&d, tmp, cpu_map, i); @@ -7363,15 +7374,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ for_each_cpu(i, cpu_map) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i).sd; -#elif defined(CONFIG_SCHED_BOOK) - sd = &per_cpu(book_domains, i).sd; -#else - sd = &per_cpu(phys_domains, i).sd; -#endif + sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } -- cgit v1.2.3 From a9c9a9b6bff27ac9c746344a9c1a19bf3327002c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:49 +0200 Subject: sched: Simplify sched_groups_power initialization Again, instead of relying on knowing the possible domains and their order, simply rely on the sched_domain tree and whatever domains are present in there to initialize the sched_group cpu_power. Note: we need to iterate the CPU mask backwards because of the cpumask_first() condition for iterating up the tree. By iterating the mask backwards we ensure all groups of a domain are set-up before starting on the parent groups that rely on its children to be completely done. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.187335414@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d6992bfa11eb..1cca59ec4a49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7334,43 +7334,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu(i, cpu_map) { - sd = &per_cpu(cpu_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_MC - for_each_cpu(i, cpu_map) { - sd = &per_cpu(core_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_BOOK - for_each_cpu(i, cpu_map) { - sd = &per_cpu(book_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif - - for_each_cpu(i, cpu_map) { - sd = &per_cpu(phys_domains, i).sd; - init_sched_groups_power(i, sd); - } - -#ifdef CONFIG_NUMA - for_each_cpu(i, cpu_map) { - sd = &per_cpu(node_domains, i).sd; - init_sched_groups_power(i, sd); - } + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; - if (d.sd_allnodes) { - for_each_cpu(i, cpu_map) { - sd = &per_cpu(allnodes_domains, i).sd; + sd = *per_cpu_ptr(d.sd, i); + for (; sd; sd = sd->parent) init_sched_groups_power(i, sd); - } } -#endif /* Attach the domains */ for_each_cpu(i, cpu_map) { -- cgit v1.2.3 From dce840a08702bd13a9a186e07e63d1ef82256b5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:50 +0200 Subject: sched: Dynamically allocate sched_domain/sched_group data-structures Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them. Allocating this dynamically also allows for some build_sched_groups() simplification since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge. One tricky to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down, per-cpu is not sufficient since that can lead to partial sched_group existance (could possibly be solved by doing the tear-down backwards but this is much more robust). A concequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain() since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations. Another thing to note is that we now fully rely on normal RCU and not RCU-sched, this is because with the new and exiting RCU flavours we grew over the years BH doesn't necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief since we do sched_domain/sched_group iterations from softirq context. This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting this. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 479 +++++++++++++++++++++++---------------------------------- 1 file changed, 189 insertions(+), 290 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1cca59ec4a49..65204845063e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -417,6 +417,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -571,7 +572,7 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_sched_held() || \ + rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -6572,12 +6573,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -static void free_rootdomain(struct root_domain *rd) +static void free_rootdomain(struct rcu_head *rcu) { - synchronize_sched(); + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6618,7 +6618,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - free_rootdomain(old_rd); + call_rcu_sched(&old_rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -6669,6 +6669,25 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + if (atomic_dec_and_test(&sd->groups->ref)) + kfree(sd->groups); + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -6689,20 +6708,25 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { + tmp = sd; sd = sd->parent; + destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; } - sched_domain_debug(sd, cpu); + /* sched_domain_debug(sd, cpu); */ rq_attach_root(rq, rd); + tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); } /* cpus with isolated domains */ @@ -6718,56 +6742,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. - */ -static void -init_sched_build_groups(const struct cpumask *span, - const struct cpumask *cpu_map, - int (*group_fn)(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *tmpmask), - struct cpumask *covered, struct cpumask *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; - - for_each_cpu(j, span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - #define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA @@ -6858,154 +6832,96 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; +}; + struct s_data { #ifdef CONFIG_NUMA int sd_allnodes; #endif cpumask_var_t nodemask; cpumask_var_t send_covered; - cpumask_var_t tmpmask; struct sched_domain ** __percpu sd; + struct sd_data sdd[SD_LV_MAX]; struct root_domain *rd; }; enum s_alloc { sa_rootdomain, sa_sd, - sa_tmpmask, + sa_sd_storage, sa_send_covered, sa_nodemask, sa_none, }; /* - * SMT sched-domains: + * Assumes the sched_domain tree is fully constructed */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_groups); - -static int -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { - if (sg) - *sg = &per_cpu(sched_groups, cpu).sg; - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct static_sched_domain, core_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); + if (child) + cpu = cpumask_first(sched_domain_span(child)); -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_SMT - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif if (sg) - *sg = &per_cpu(sched_group_core, group).sg; - return group; + *sg = *per_cpu_ptr(sdd->sg, cpu); + + return cpu; } -#endif /* CONFIG_SCHED_MC */ /* - * book sched-domains: + * build_sched_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. */ -#ifdef CONFIG_SCHED_BOOK -static DEFINE_PER_CPU(struct static_sched_domain, book_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); - -static int -cpu_to_book_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group = cpu; -#ifdef CONFIG_SCHED_MC - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#endif - if (sg) - *sg = &per_cpu(sched_group_book, group).sg; - return group; -} -#endif /* CONFIG_SCHED_BOOK */ - -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); - -static int -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) +static void +build_sched_groups(struct sched_domain *sd, struct cpumask *covered) { - int group; -#ifdef CONFIG_SCHED_BOOK - cpumask_and(mask, cpu_book_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_MC) - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_phys, group).sg; - return group; -} - -#ifdef CONFIG_NUMA -static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_node); + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + int i; -static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; + cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; - if (sg) - *sg = &per_cpu(sched_group_node, group).sg; - return group; -} + if (cpumask_test_cpu(i, covered)) + continue; -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); + cpumask_clear(sched_group_cpus(sg)); + sg->cpu_power = 0; -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) + continue; - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } - if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; - return group; + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; } -#endif /* CONFIG_NUMA */ - /* * Initialize sched groups cpu_power. * @@ -7039,15 +6955,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT(sd, type) sd_init_##type(sd) - -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain *sd_init_##type(struct s_data *d, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(d->sdd[SD_LV_##type].sd, cpu); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ + sd->private = &d->sdd[SD_LV_##type]; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7103,13 +7019,22 @@ static void set_domain_attribute(struct sched_domain *sd, static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { + int i, j; + switch (what) { case sa_rootdomain: - free_rootdomain(d->rd); /* fall through */ + free_rootdomain(&d->rd->rcu); /* fall through */ case sa_sd: free_percpu(d->sd); /* fall through */ - case sa_tmpmask: - free_cpumask_var(d->tmpmask); /* fall through */ + case sa_sd_storage: + for (i = 0; i < SD_LV_MAX; i++) { + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(d->sdd[i].sd, j)); + kfree(*per_cpu_ptr(d->sdd[i].sg, j)); + } + free_percpu(d->sdd[i].sd); + free_percpu(d->sdd[i].sg); + } /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ case sa_nodemask: @@ -7122,25 +7047,70 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { + int i, j; + + memset(d, 0, sizeof(*d)); + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) return sa_none; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) return sa_nodemask; - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) - return sa_send_covered; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) { - printk(KERN_WARNING "Cannot alloc per-cpu pointers\n"); - return sa_tmpmask; + for (i = 0; i < SD_LV_MAX; i++) { + d->sdd[i].sd = alloc_percpu(struct sched_domain *); + if (!d->sdd[i].sd) + return sa_sd_storage; + + d->sdd[i].sg = alloc_percpu(struct sched_group *); + if (!d->sdd[i].sg) + return sa_sd_storage; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return sa_sd_storage; + + *per_cpu_ptr(d->sdd[i].sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return sa_sd_storage; + + *per_cpu_ptr(d->sdd[i].sg, j) = sg; + } } + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; d->rd = alloc_rootdomain(); - if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); + if (!d->rd) return sa_sd; - } return sa_rootdomain; } +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + struct sched_group *sg = sd->groups; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; + + if (cpu == cpumask_first(sched_group_cpus(sg))) { + WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + *per_cpu_ptr(sdd->sg, cpu) = NULL; + } +} + static struct sched_domain *__build_numa_sched_domains(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) { @@ -7151,24 +7121,20 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, d->sd_allnodes = 0; if (cpumask_weight(cpu_map) > SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); + sd = sd_init_ALLNODES(d, i); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); d->sd_allnodes = 1; } parent = sd; - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); + sd = sd_init_NODE(d, i); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = parent; if (parent) parent->child = sd; cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); - cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7178,14 +7144,12 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, struct sched_domain *parent, int i) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); + sd = sd_init_CPU(d, i); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), d->nodemask); sd->parent = parent; if (parent) parent->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); return sd; } @@ -7195,13 +7159,11 @@ static struct sched_domain *__build_book_sched_domain(struct s_data *d, { struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_BOOK - sd = &per_cpu(book_domains, i).sd; - SD_INIT(sd, BOOK); + sd = sd_init_BOOK(d, i); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); sd->parent = parent; parent->child = sd; - cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7212,13 +7174,11 @@ static struct sched_domain *__build_mc_sched_domain(struct s_data *d, { struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_MC - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); + sd = sd_init_MC(d, i); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); sd->parent = parent; parent->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7229,92 +7189,32 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, { struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); + sd = sd_init_SIBLING(d, i); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); sd->parent = parent; parent->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } -static void build_sched_groups(struct s_data *d, struct sched_domain *sd, - const struct cpumask *cpu_map, int cpu) -{ - switch (sd->level) { -#ifdef CONFIG_SCHED_SMT - case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_cpu_group, - d->send_covered, d->tmpmask); - break; -#endif -#ifdef CONFIG_SCHED_MC - case SD_LV_MC: /* set up multi-core groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_core_group, - d->send_covered, d->tmpmask); - break; -#endif -#ifdef CONFIG_SCHED_BOOK - case SD_LV_BOOK: /* set up book groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_book_group, - d->send_covered, d->tmpmask); - break; -#endif - case SD_LV_CPU: /* set up physical groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_phys_group, - d->send_covered, d->tmpmask); - break; -#ifdef CONFIG_NUMA - case SD_LV_NODE: - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_node_group, - d->send_covered, d->tmpmask); - - case SD_LV_ALLNODES: - if (cpu == cpumask_first(cpu_map)) - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - d->send_covered, d->tmpmask); - break; -#endif - default: - break; - } -} - /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) { enum s_alloc alloc_state = sa_none; + struct sched_domain *sd; struct s_data d; - struct sched_domain *sd, *tmp; int i; -#ifdef CONFIG_NUMA - d.sd_allnodes = 0; -#endif alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - /* - * Set up domains for cpus specified by the cpu_map. - */ + /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -7326,10 +7226,19 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); *per_cpu_ptr(d.sd, i) = sd; + } + + /* Build the groups for the domains */ + for_each_cpu(i, cpu_map) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + get_group(i, sd->private, &sd->groups); + atomic_inc(&sd->groups->ref); - for (tmp = sd; tmp; tmp = tmp->parent) { - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - build_sched_groups(&d, tmp, cpu_map, i); + if (i != cpumask_first(sched_domain_span(sd))) + continue; + + build_sched_groups(sd, d.send_covered); } } @@ -7338,18 +7247,21 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (!cpumask_test_cpu(i, cpu_map)) continue; - sd = *per_cpu_ptr(d.sd, i); - for (; sd; sd = sd->parent) + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); init_sched_groups_power(i, sd); + } } /* Attach the domains */ + rcu_read_lock(); for_each_cpu(i, cpu_map) { sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } + rcu_read_unlock(); - __free_domain_allocs(&d, sa_tmpmask, cpu_map); + __free_domain_allocs(&d, sa_sd, cpu_map); return 0; error: @@ -7357,11 +7269,6 @@ error: return -ENOMEM; } -static int build_sched_domains(const struct cpumask *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); -} - static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; @@ -7425,31 +7332,24 @@ static int init_sched_domains(const struct cpumask *cpu_map) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur[0]); + err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); return err; } -static void destroy_sched_domains(const struct cpumask *cpu_map, - struct cpumask *tmpmask) -{ -} - /* * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ static void detach_destroy_domains(const struct cpumask *cpu_map) { - /* Save because hotplug lock held. */ - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + rcu_read_unlock(); } /* handle null as "default" */ @@ -7538,8 +7438,7 @@ match1: goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new[i], - dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } -- cgit v1.2.3 From 822ff793c34a5d4c8b5f3f9ce932602233d96464 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:51 +0200 Subject: sched: Simplify the free path some If we check the root_domain reference count we can see if its been used or not, use this observation to simplify some of the return paths. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.298339503@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 65204845063e..72c194c55c31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7023,7 +7023,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, switch (what) { case sa_rootdomain: - free_rootdomain(&d->rd->rcu); /* fall through */ + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ case sa_sd: free_percpu(d->sd); /* fall through */ case sa_sd_storage: @@ -7208,7 +7209,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; - int i; + int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) @@ -7261,12 +7262,10 @@ static int build_sched_domains(const struct cpumask *cpu_map, } rcu_read_unlock(); - __free_domain_allocs(&d, sa_sd, cpu_map); - return 0; - + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); - return -ENOMEM; + return ret; } static cpumask_var_t *doms_cur; /* current sched domains */ -- cgit v1.2.3 From 3bd65a80affb9768b91f03c56dba46ee79525f9b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:54 +0200 Subject: sched: Simplify NODE/ALLNODES domain creation Don't treat ALLNODES/NODE different for difference's sake. Simply always create the ALLNODES domain and let the sd_degenerate() checks kill it when its redundant. This simplifies the code flow. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.455464579@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 72c194c55c31..d395fe5493c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6838,9 +6838,6 @@ struct sd_data { }; struct s_data { -#ifdef CONFIG_NUMA - int sd_allnodes; -#endif cpumask_var_t nodemask; cpumask_var_t send_covered; struct sched_domain ** __percpu sd; @@ -7112,30 +7109,35 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } } -static struct sched_domain *__build_numa_sched_domains(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) { struct sched_domain *sd = NULL; #ifdef CONFIG_NUMA - struct sched_domain *parent; - - d->sd_allnodes = 0; - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = sd_init_ALLNODES(d, i); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - d->sd_allnodes = 1; - } - parent = sd; + sd = sd_init_ALLNODES(d, i); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + sd->parent = parent; + if (parent) + parent->child = sd; +#endif + return sd; +} +static struct sched_domain *__build_node_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = NULL; +#ifdef CONFIG_NUMA sd = sd_init_NODE(d, i); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); sd->parent = parent; if (parent) parent->child = sd; - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); #endif return sd; } @@ -7220,7 +7222,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); - sd = __build_numa_sched_domains(&d, cpu_map, attr, i); + sd = NULL; + sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); -- cgit v1.2.3 From bf28b253266ebd73c331dde24d64606afde32ceb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:55 +0200 Subject: sched: Remove nodemask allocation There's only one nodemask user left so remove it with a direct computation and save some memory and reduce some code-flow complexity. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.505608966@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d395fe5493c9..f4d3a624c50a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6838,7 +6838,6 @@ struct sd_data { }; struct s_data { - cpumask_var_t nodemask; cpumask_var_t send_covered; struct sched_domain ** __percpu sd; struct sd_data sdd[SD_LV_MAX]; @@ -6850,7 +6849,6 @@ enum s_alloc { sa_sd, sa_sd_storage, sa_send_covered, - sa_nodemask, sa_none, }; @@ -7035,8 +7033,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, } /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ - case sa_nodemask: - free_cpumask_var(d->nodemask); /* fall through */ case sa_none: break; } @@ -7049,10 +7045,8 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, memset(d, 0, sizeof(*d)); - if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_none; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_nodemask; + return sa_none; for (i = 0; i < SD_LV_MAX; i++) { d->sdd[i].sd = alloc_percpu(struct sched_domain *); if (!d->sdd[i].sd) @@ -7149,7 +7143,8 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, struct sched_domain *sd; sd = sd_init_CPU(d, i); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d->nodemask); + cpumask_and(sched_domain_span(sd), + cpumask_of_node(cpu_to_node(i)), cpu_map); sd->parent = parent; if (parent) parent->child = sd; @@ -7219,9 +7214,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), - cpu_map); - sd = NULL; sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i); -- cgit v1.2.3 From 7dd04b730749f957c116f363524fd622b05e5141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:56 +0200 Subject: sched: Remove some dead code Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.553814623@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f4d3a624c50a..5ec685ce516a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6816,22 +6816,6 @@ static void sched_domain_node_span(int node, struct cpumask *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * - * ( See the the comments in include/linux/sched.h:struct sched_group - * and struct sched_domain. ) - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; - -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); -}; - struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; -- cgit v1.2.3 From f96225fd51893b6650cffd5427f13f6b1b356488 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:57 +0200 Subject: sched: Create persistent sched_domains_tmpmask Since sched domain creation is fully serialized by the sched_domains_mutex we can create a single persistent tmpmask to use during domain creation. This removes the need for s_data::send_covered. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.607287405@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5ec685ce516a..fd73e91be089 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6822,7 +6822,6 @@ struct sd_data { }; struct s_data { - cpumask_var_t send_covered; struct sched_domain ** __percpu sd; struct sd_data sdd[SD_LV_MAX]; struct root_domain *rd; @@ -6832,7 +6831,6 @@ enum s_alloc { sa_rootdomain, sa_sd, sa_sd_storage, - sa_send_covered, sa_none, }; @@ -6853,6 +6851,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) return cpu; } +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + /* * build_sched_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU @@ -6864,13 +6864,17 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) * and ->cpu_power to 0. */ static void -build_sched_groups(struct sched_domain *sd, struct cpumask *covered) +build_sched_groups(struct sched_domain *sd) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; int i; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + cpumask_clear(covered); for_each_cpu(i, span) { @@ -7015,8 +7019,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_percpu(d->sdd[i].sd); free_percpu(d->sdd[i].sg); } /* fall through */ - case sa_send_covered: - free_cpumask_var(d->send_covered); /* fall through */ case sa_none: break; } @@ -7029,8 +7031,6 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, memset(d, 0, sizeof(*d)); - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_none; for (i = 0; i < SD_LV_MAX; i++) { d->sdd[i].sd = alloc_percpu(struct sched_domain *); if (!d->sdd[i].sd) @@ -7219,7 +7219,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, if (i != cpumask_first(sched_domain_span(sd))) continue; - build_sched_groups(sd, d.send_covered); + build_sched_groups(sd); } } @@ -7896,6 +7896,7 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); -- cgit v1.2.3 From 4cb988395da6e16627a8be69729e50cd72ebb23e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:58 +0200 Subject: sched: Avoid allocations in sched_domain_debug() Since we're all serialized by sched_domains_mutex we can use sched_domains_tmpmask and avoid having to do allocations. This means we can use sched_domains_debug() for cpu_attach_domain() again. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.664347467@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fd73e91be089..35fc9959b564 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6395,6 +6395,8 @@ early_initcall(migration_init); #ifdef CONFIG_SMP +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + #ifdef CONFIG_SCHED_DEBUG static __read_mostly int sched_domain_debug_enabled; @@ -6490,7 +6492,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_var_t groupmask; int level = 0; if (!sched_domain_debug_enabled) @@ -6503,20 +6504,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) break; level++; sd = sd->parent; if (!sd) break; } - free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6721,7 +6716,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } - /* sched_domain_debug(sd, cpu); */ + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); tmp = rq->sd; @@ -6851,8 +6846,6 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) return cpu; } -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - /* * build_sched_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU @@ -7896,8 +7889,8 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); -- cgit v1.2.3 From d3081f52f29da1ba6c27685519a9222b39eac763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:59 +0200 Subject: sched: Create proper cpu_$DOM_mask() functions In order to unify the sched domain creation more, create proper cpu_$DOM_mask() functions for those domains that didn't already have one. Use the sched_domains_tmpmask for the weird NUMA domain span. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.717702108@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 35fc9959b564..3ae1e023f3f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6807,8 +6807,22 @@ static void sched_domain_node_span(int node, struct cpumask *span) cpumask_or(span, span, cpumask_of_node(next_node)); } } + +static const struct cpumask *cpu_node_mask(int cpu) +{ + lockdep_assert_held(&sched_domains_mutex); + + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + + return sched_domains_tmpmask; +} #endif /* CONFIG_NUMA */ +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { @@ -7088,7 +7102,7 @@ static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d, #ifdef CONFIG_NUMA sd = sd_init_ALLNODES(d, i); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_possible_mask); sd->parent = parent; if (parent) parent->child = sd; @@ -7104,8 +7118,7 @@ static struct sched_domain *__build_node_sched_domain(struct s_data *d, #ifdef CONFIG_NUMA sd = sd_init_NODE(d, i); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_node_mask(i)); sd->parent = parent; if (parent) parent->child = sd; @@ -7120,8 +7133,7 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, struct sched_domain *sd; sd = sd_init_CPU(d, i); set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - cpumask_of_node(cpu_to_node(i)), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_cpu_mask(i)); sd->parent = parent; if (parent) parent->child = sd; -- cgit v1.2.3 From eb7a74e6cd936c00749e2921b9e058631d986648 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:00 +0200 Subject: sched: Stuff the sched_domain creation in a data-structure In order to make the topology contruction fully dynamic, remove the still hard-coded list of possible domains and stick them in a data-structure. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.770335383@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3ae1e023f3f0..f0e1821dcb96 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6843,6 +6843,16 @@ enum s_alloc { sa_none, }; +typedef struct sched_domain *(*sched_domain_build_f)(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int cpu); + +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + +struct sched_domain_topology_level { + sched_domain_build_f build; +}; + /* * Assumes the sched_domain tree is fully constructed */ @@ -7185,6 +7195,18 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } +static struct sched_domain_topology_level default_topology[] = { + { __build_allnodes_sched_domain, }, + { __build_node_sched_domain, }, + { __build_cpu_sched_domain, }, + { __build_book_sched_domain, }, + { __build_mc_sched_domain, }, + { __build_smt_sched_domain, }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -7203,13 +7225,11 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + sd = NULL; - sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + for (tl = sched_domain_topology; tl->build; tl++) + sd = tl->build(&d, cpu_map, attr, sd, i); *per_cpu_ptr(d.sd, i) = sd; } -- cgit v1.2.3 From 2c402dc3bb502e9dd74fce72c14d293fcef4719d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:01 +0200 Subject: sched: Unify the sched_domain build functions Since all the __build_$DOM_sched_domain() functions do pretty much the same thing, unify them. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.826347257@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 133 +++++++++++++++++---------------------------------------- 1 file changed, 39 insertions(+), 94 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f0e1821dcb96..00d1e37b4596 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6816,6 +6816,11 @@ static const struct cpumask *cpu_node_mask(int cpu) return sched_domains_tmpmask; } + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ + return cpu_possible_mask; +} #endif /* CONFIG_NUMA */ static const struct cpumask *cpu_cpu_mask(int cpu) @@ -6843,14 +6848,12 @@ enum s_alloc { sa_none, }; -typedef struct sched_domain *(*sched_domain_build_f)(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int cpu); - +typedef struct sched_domain *(*sched_domain_init_f)(struct s_data *d, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); struct sched_domain_topology_level { - sched_domain_build_f build; + sched_domain_init_f init; + sched_domain_mask_f mask; }; /* @@ -7104,109 +7107,51 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } } -static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) { - struct sched_domain *sd = NULL; -#ifdef CONFIG_NUMA - sd = sd_init_ALLNODES(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_possible_mask); - sd->parent = parent; - if (parent) - parent->child = sd; -#endif - return sd; + return topology_thread_cpumask(cpu); } +#endif -static struct sched_domain *__build_node_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = NULL; +static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_NUMA - sd = sd_init_NODE(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_node_mask(i)); - sd->parent = parent; - if (parent) - parent->child = sd; + { sd_init_ALLNODES, cpu_allnodes_mask, }, + { sd_init_NODE, cpu_node_mask, }, #endif - return sd; -} - -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd; - sd = sd_init_CPU(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_cpu_mask(i)); - sd->parent = parent; - if (parent) - parent->child = sd; - return sd; -} - -static struct sched_domain *__build_book_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; + { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_SCHED_BOOK - sd = sd_init_BOOK(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); - sd->parent = parent; - parent->child = sd; + { sd_init_BOOK, cpu_book_mask, }, #endif - return sd; -} - -static struct sched_domain *__build_mc_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_MC - sd = sd_init_MC(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); - sd->parent = parent; - parent->child = sd; + { sd_init_MC, cpu_coregroup_mask, }, #endif - return sd; -} - -static struct sched_domain *__build_smt_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_SMT - sd = sd_init_SIBLING(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); - sd->parent = parent; - parent->child = sd; + { sd_init_SIBLING, cpu_smt_mask, }, #endif - return sd; -} - -static struct sched_domain_topology_level default_topology[] = { - { __build_allnodes_sched_domain, }, - { __build_node_sched_domain, }, - { __build_cpu_sched_domain, }, - { __build_book_sched_domain, }, - { __build_mc_sched_domain, }, - { __build_smt_sched_domain, }, { NULL, }, }; static struct sched_domain_topology_level *sched_domain_topology = default_topology; +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *parent, + int cpu) +{ + struct sched_domain *sd = tl->init(d, cpu); + if (!sd) + return parent; + + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd->parent = parent; + if (parent) + parent->child = sd; + + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -7228,8 +7173,8 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->build; tl++) - sd = tl->build(&d, cpu_map, attr, sd, i); + for (tl = sched_domain_topology; tl->init; tl++) + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); *per_cpu_ptr(d.sd, i) = sd; } -- cgit v1.2.3 From d069b916f7b50021d41d6ce498f86da32a7afaec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:02 +0200 Subject: sched: Reverse the topology list In order to get rid of static sched_domain::level assignments, reverse the topology iteration. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.876506131@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 00d1e37b4596..38bc53b576a7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7114,20 +7114,23 @@ static const struct cpumask *cpu_smt_mask(int cpu) } #endif +/* + * Topology list, bottom-up. + */ static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_NUMA - { sd_init_ALLNODES, cpu_allnodes_mask, }, - { sd_init_NODE, cpu_node_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, +#ifdef CONFIG_SCHED_SMT + { sd_init_SIBLING, cpu_smt_mask, }, #endif #ifdef CONFIG_SCHED_MC { sd_init_MC, cpu_coregroup_mask, }, #endif -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, +#ifdef CONFIG_SCHED_BOOK + { sd_init_BOOK, cpu_book_mask, }, +#endif + { sd_init_CPU, cpu_cpu_mask, }, +#ifdef CONFIG_NUMA + { sd_init_NODE, cpu_node_mask, }, + { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, }; @@ -7136,18 +7139,18 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *parent, + struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { struct sched_domain *sd = tl->init(d, cpu); if (!sd) - return parent; + return child; set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd->parent = parent; - if (parent) - parent->child = sd; + if (child) + child->parent = sd; + sd->child = child; return sd; } @@ -7176,6 +7179,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, for (tl = sched_domain_topology; tl->init; tl++) sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + while (sd->child) + sd = sd->child; + *per_cpu_ptr(d.sd, i) = sd; } -- cgit v1.2.3 From 54ab4ff4316eb329d2c1acc110fbc623d2966931 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:03 +0200 Subject: sched: Move sched domain storage into the topology list In order to remove the last dependency on the statid domain levels, move the sd_data storage into the topology structure. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.924926412@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 129 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 52 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 38bc53b576a7..3231e1997426 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6837,7 +6837,6 @@ struct sd_data { struct s_data { struct sched_domain ** __percpu sd; - struct sd_data sdd[SD_LV_MAX]; struct root_domain *rd; }; @@ -6848,12 +6847,15 @@ enum s_alloc { sa_none, }; -typedef struct sched_domain *(*sched_domain_init_f)(struct s_data *d, int cpu); +struct sched_domain_topology_level; + +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + struct sd_data data; }; /* @@ -6958,15 +6960,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain *sd_init_##type(struct s_data *d, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(d->sdd[SD_LV_##type].sd, cpu); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ - sd->private = &d->sdd[SD_LV_##type]; \ - return sd; \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7019,11 +7022,12 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { - int i, j; - switch (what) { case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) @@ -7031,14 +7035,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_sd: free_percpu(d->sd); /* fall through */ case sa_sd_storage: - for (i = 0; i < SD_LV_MAX; i++) { - for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(d->sdd[i].sd, j)); - kfree(*per_cpu_ptr(d->sdd[i].sg, j)); - } - free_percpu(d->sdd[i].sd); - free_percpu(d->sdd[i].sg); - } /* fall through */ + __sdt_free(cpu_map); /* fall through */ case sa_none: break; } @@ -7047,38 +7044,10 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { - int i, j; - memset(d, 0, sizeof(*d)); - for (i = 0; i < SD_LV_MAX; i++) { - d->sdd[i].sd = alloc_percpu(struct sched_domain *); - if (!d->sdd[i].sd) - return sa_sd_storage; - - d->sdd[i].sg = alloc_percpu(struct sched_group *); - if (!d->sdd[i].sg) - return sa_sd_storage; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - struct sched_group *sg; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return sa_sd_storage; - - *per_cpu_ptr(d->sdd[i].sd, j) = sd; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sg) - return sa_sd_storage; - - *per_cpu_ptr(d->sdd[i].sg, j) = sg; - } - } + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) return sa_sd_storage; @@ -7137,12 +7106,68 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + *per_cpu_ptr(sdd->sg, j) = sg; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(sdd->sd, j)); + kfree(*per_cpu_ptr(sdd->sg, j)); + } + free_percpu(sdd->sd); + free_percpu(sdd->sg); + } +} + struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(d, cpu); + struct sched_domain *sd = tl->init(tl, cpu); if (!sd) return child; -- cgit v1.2.3 From 60495e7760d8ee364695006af37309b0755e0e17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:04 +0200 Subject: sched: Dynamic sched_domain::level Remove the SD_LV_ enum and use dynamic level assignments. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.969433965@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3231e1997426..506cb8147c70 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6966,7 +6966,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ { \ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ SD_INIT_NAME(sd, type); \ sd->private = &tl->data; \ return sd; \ @@ -6988,13 +6987,14 @@ SD_INIT_FUNC(CPU) #endif static int default_relax_domain_level = -1; +int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { unsigned long val; val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) + if (val < sched_domain_level_max) default_relax_domain_level = val; return 1; @@ -7173,8 +7173,11 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + } sd->child = child; return sd; -- cgit v1.2.3 From 3ca7a440da394808571dad32d33d3bc0389982e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:40 +0200 Subject: sched: Always provide p->on_cpu Always provide p->on_cpu so that we can determine if its on a cpu without having to lock the rq. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.785452014@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027b..cd2593e1a3ec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -838,18 +838,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -865,15 +886,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -882,7 +894,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. */ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -895,12 +907,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -2686,8 +2698,8 @@ void sched_fork(struct task_struct *p, int clone_flags) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -5776,8 +5788,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3 From c6eb3dda25892f1f974f5420f63e6721aab02f6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:41 +0200 Subject: mutex: Use p->on_cpu for the adaptive spin Since we now have p->on_cpu unconditionally available, use it to re-implement mutex_spin_on_owner. Requested-by: Thomas Gleixner Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.826338173@chello.nl --- kernel/sched.c | 83 +++++++++++++++++++++++----------------------------------- 1 file changed, 33 insertions(+), 50 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cd2593e1a3ec..55cc50323ce1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4173,70 +4173,53 @@ need_resched: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. - */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? - */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif -- cgit v1.2.3 From c2f7115e2e52a6c187b8c1f54f0e4970bb677be0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Apr 2011 13:28:56 +0200 Subject: sched: Move wq_worker_waking to the correct site wq_worker_waking_up() needs to match wq_worker_sleeping(), since the latter is only called on deactivate, move the former near activate. Signed-off-by: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/n/top-t3m7n70n9frmv4pv2n5fwmov@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 55cc50323ce1..81ab58efd788 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2421,6 +2421,10 @@ static inline void ttwu_activate(struct task_struct *p, struct rq *rq, schedstat_inc(p, se.statistics.nr_wakeups_remote); activate_task(rq, p, en_flags); + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); } static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, @@ -2445,9 +2449,6 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif - /* if a worker is waking up, notify workqueue */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); } /** -- cgit v1.2.3 From 893633817f5b58f5227365d74344e0170a718213 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:42 +0200 Subject: sched: Change the ttwu() success details try_to_wake_up() would only return a success when it would have to place a task on a rq, change that to every time we change p->state to TASK_RUNNING, because that's the real measure of wakeups. This results in that success is always true for the tracepoints. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.866866929@chello.nl --- kernel/sched.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 81ab58efd788..3919aa419356 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2427,10 +2427,10 @@ static inline void ttwu_activate(struct task_struct *p, struct rq *rq, wq_worker_waking_up(p, cpu_of(rq)); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - int wake_flags, bool success) +static void +ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags) { - trace_sched_wakeup(p, success); + trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2546,9 +2546,9 @@ out_activate: #endif /* CONFIG_SMP */ ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, cpu == this_cpu, en_flags); - success = 1; out_running: - ttwu_post_activation(p, rq, wake_flags, success); + ttwu_post_activation(p, rq, wake_flags); + success = 1; out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2567,7 +2567,6 @@ out: static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; BUG_ON(rq != this_rq()); BUG_ON(p == current); @@ -2582,9 +2581,8 @@ static void try_to_wake_up_local(struct task_struct *p) schedstat_inc(rq, ttwu_local); } ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - success = true; } - ttwu_post_activation(p, rq, 0, success); + ttwu_post_activation(p, rq, 0); } /** @@ -2747,7 +2745,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); - trace_sched_wakeup_new(p, 1); + trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) -- cgit v1.2.3 From d7c01d27ab767a30d672d1fd657aa8336ebdcbca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:43 +0200 Subject: sched: Clean up ttwu() stats Collect all ttwu() stat code into a single function and ensure its always called for an actual wakeup (changing p->state to TASK_RUNNING). Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.908177058@chello.nl --- kernel/sched.c | 75 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3919aa419356..4481638f9178 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2406,20 +2406,43 @@ static void update_avg(u64 *avg, u64 sample) } #endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync, bool is_migrate, bool is_local, - unsigned long en_flags) +static void +ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) { +#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); + + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; + + schedstat_inc(p, se.statistics.nr_wakeups_remote); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); schedstat_inc(p, se.statistics.nr_wakeups); - if (is_sync) + + if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (is_migrate) + + if (cpu != task_cpu(p)) schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (is_local) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ activate_task(rq, p, en_flags); /* if a worker is waking up, notify workqueue */ @@ -2481,12 +2504,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (!(p->state & state)) goto out; + cpu = task_cpu(p); + if (p->se.on_rq) goto out_running; - cpu = task_cpu(p); orig_cpu = cpu; - #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; @@ -2527,27 +2550,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ - out_activate: #endif /* CONFIG_SMP */ - ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, - cpu == this_cpu, en_flags); + ttwu_activate(rq, p, en_flags); out_running: ttwu_post_activation(p, rq, wake_flags); + ttwu_stat(rq, p, cpu, wake_flags); success = 1; out: task_rq_unlock(rq, &flags); @@ -2575,14 +2583,11 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) return; - if (!p->se.on_rq) { - if (likely(!task_running(rq, p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - } + if (!p->se.on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_post_activation(p, rq, 0); + ttwu_stat(rq, p, smp_processor_id(), 0); } /** -- cgit v1.2.3 From fd2f4419b4cbe8fe90796df9617c355762afd6a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:44 +0200 Subject: sched: Provide p->on_rq Provide a generic p->on_rq because the p->se.on_rq semantics are unfavourable for lockless wakeups but needed for sched_fair. In particular, p->on_rq is only cleared when we actually dequeue the task in schedule() and not on any random dequeue as done by things like __migrate_task() and __sched_setscheduler(). This also allows us to remove p->se usage from !sched_fair code. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.949545047@chello.nl --- kernel/sched.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4481638f9178..dece28e505c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1785,7 +1785,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1793,7 +1792,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -2128,7 +2126,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2203,7 +2201,7 @@ static bool migrate_task(struct task_struct *p, struct rq *rq) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->se.on_rq || task_running(rq, p); + return p->on_rq || task_running(rq, p); } /* @@ -2263,7 +2261,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -2444,6 +2442,7 @@ ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); + p->on_rq = 1; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -2506,7 +2505,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, cpu = task_cpu(p); - if (p->se.on_rq) + if (p->on_rq) goto out_running; orig_cpu = cpu; @@ -2583,7 +2582,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) return; - if (!p->se.on_rq) + if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); @@ -2620,19 +2619,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2750,6 +2751,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); + p->on_rq = 1; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -4051,7 +4053,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -4126,7 +4128,9 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; /* * If we are going to sleep and we have plugged IO queued, make @@ -4695,7 +4699,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4743,7 +4747,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4877,8 +4881,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -5044,7 +5046,7 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -5965,7 +5967,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -8339,7 +8341,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8682,7 +8684,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); -- cgit v1.2.3 From 013fdb8086acaae5f8eb96f9ad48fcd98882ac46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:45 +0200 Subject: sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Currently p->pi_lock already serializes p->sched_class, also put p->cpus_allowed and try_to_wake_up() under it, this prepares the way to do the first part of ttwu() without holding rq->lock. By having p->sched_class and p->cpus_allowed serialized by p->pi_lock, we prepare the way to call select_task_rq() without holding rq->lock. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.990364093@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dece28e505c9..d398f2f0a3c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2340,7 +2340,7 @@ EXPORT_SYMBOL_GPL(kick_process); #ifdef CONFIG_SMP /* - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -2373,7 +2373,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) @@ -2499,7 +2499,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); if (!(p->state & state)) goto out; @@ -2557,7 +2558,8 @@ out_running: ttwu_stat(rq, p, cpu, wake_flags); success = 1; out: - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); put_cpu(); return success; @@ -4694,6 +4696,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); + lockdep_assert_held(&p->pi_lock); + rq = task_rq_lock(p, &flags); trace_sched_pi_setprio(p, prio); @@ -5317,7 +5321,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; unsigned long flags; - struct rq *rq; int retval; get_online_cpus(); @@ -5332,9 +5335,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -5882,18 +5885,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - /* - * Serialize against TASK_WAKING so that ttwu() and wunt() can - * drop the rq->lock and still rely on ->cpus_allowed. - */ -again: - while (task_is_waking(p)) - cpu_relax(); - rq = task_rq_lock(p, &flags); - if (task_is_waking(p)) { - task_rq_unlock(rq, &flags); - goto again; - } + raw_spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5921,13 +5914,15 @@ again: if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return ret; } -- cgit v1.2.3 From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:46 +0200 Subject: sched: Drop the rq argument to sched_class::select_task_rq() In preparation of calling select_task_rq() without rq->lock held, drop the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d398f2f0a3c9..d4b815d345b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2195,13 +2195,15 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, struct rq *rq) +static bool need_migrate_task(struct task_struct *p) { /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->on_rq || task_running(rq, p); + bool running = p->on_rq || p->on_cpu; + smp_rmb(); /* finish_lock_switch() */ + return running; } /* @@ -2376,9 +2378,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2533,7 +2535,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, en_flags |= ENQUEUE_WAKING; } - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); __task_rq_unlock(rq); @@ -2744,7 +2746,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * We set TASK_WAKING so that select_task_rq() can drop rq->lock * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); set_task_cpu(p, cpu); p->state = TASK_RUNNING; @@ -3474,7 +3476,7 @@ void sched_exec(void) int dest_cpu; rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3482,7 +3484,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -5911,7 +5913,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ __task_rq_unlock(rq); -- cgit v1.2.3 From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:47 +0200 Subject: sched: Remove rq argument to sched_class::task_waking() In preparation of calling this without rq->lock held, remove the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d4b815d345b3..46f42cac4eb1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2531,7 +2531,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); + p->sched_class->task_waking(p); en_flags |= ENQUEUE_WAKING; } -- cgit v1.2.3 From 3fe1698b7fe05aeb063564e71e40d09f28d8e80c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:48 +0200 Subject: sched: Deal with non-atomic min_vruntime reads on 32bits In order to avoid reading partial updated min_vruntime values on 32bit implement a seqcount like solution. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.111378493@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 46f42cac4eb1..7a5eb2620785 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -312,6 +312,9 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif struct rb_root tasks_timeline; struct rb_node *rb_leftmost; -- cgit v1.2.3 From a8e4f2eaecc9bfa4954adf79a04f4f22fddd829c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:49 +0200 Subject: sched: Delay task_contributes_to_load() In prepratation of having to call task_contributes_to_load() without holding rq->lock, we need to store the result until we do and can update the rq accounting accordingly. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.151523907@chello.nl --- kernel/sched.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7a5eb2620785..fd32b78c123c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2519,18 +2519,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (unlikely(task_running(rq, p))) goto out_activate; - /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: - */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; if (p->sched_class->task_waking) { @@ -2555,6 +2544,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + out_activate: #endif /* CONFIG_SMP */ ttwu_activate(rq, p, en_flags); -- cgit v1.2.3 From 2acca55ed98ad9b9aa25e7e587ebe306c0313dc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:50 +0200 Subject: sched: Also serialize ttwu_local() with p->pi_lock Since we now serialize ttwu() using p->pi_lock, we also need to serialize ttwu_local() using that, otherwise, once we drop the rq->lock from ttwu() it can race with ttwu_local(). Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.192366907@chello.nl --- kernel/sched.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fd32b78c123c..6b269b79c52c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2566,9 +2566,9 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not already there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. this_rq() stays locked over invocation. + * the current task. */ static void try_to_wake_up_local(struct task_struct *p) { @@ -2578,14 +2578,22 @@ static void try_to_wake_up_local(struct task_struct *p) BUG_ON(p == current); lockdep_assert_held(&rq->lock); + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + if (!(p->state & TASK_NORMAL)) - return; + goto out; if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); ttwu_stat(rq, p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -4114,11 +4122,13 @@ need_resched: if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. */ if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; @@ -4128,12 +4138,9 @@ need_resched: try_to_wake_up_local(to_wakeup); } - deactivate_task(rq, prev, DEQUEUE_SLEEP); - prev->on_rq = 0; - /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. + * If we are going to sleep and we have plugged IO + * queued, make sure to submit it to avoid deadlocks. */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); -- cgit v1.2.3 From 0122ec5b02f766c355b3168df53a6c038a24fa0d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:51 +0200 Subject: sched: Add p->pi_lock to task_rq_lock() In order to be able to call set_task_cpu() while either holding p->pi_lock or task_rq(p)->lock we need to hold both locks in order to stabilize task_rq(). This makes task_rq_lock() acquire both locks, and have __task_rq_lock() validate that p->pi_lock is held. This increases the locking overhead for most scheduler syscalls but allows reduction of rq->lock contention for some scheduler hot paths (ttwu). Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.232781355@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 103 ++++++++++++++++++++++++++------------------------------- 1 file changed, 47 insertions(+), 56 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6b269b79c52c..f1551271a685 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -599,7 +599,7 @@ static inline int cpu_of(struct rq *rq) * Return the group to which this tasks belongs. * * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() * holds that lock for each task it moves into the cgroup. Therefore * by holding that lock, we pin the task to the current cgroup. */ @@ -609,7 +609,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&task_rq(p)->lock)); + lockdep_is_held(&p->pi_lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -924,23 +924,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize ->cpus_allowed - * against ttwu(). - */ -static inline int task_is_waking(struct task_struct *p) -{ - return unlikely(p->state == TASK_WAKING); -} - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. + * __task_rq_lock - lock the rq @p resides on. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { struct rq *rq; + lockdep_assert_held(&p->pi_lock); + for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); @@ -951,22 +943,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) } /* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. */ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) __acquires(rq->lock) { struct rq *rq; for (;;) { - local_irq_save(*flags); + raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } } @@ -976,10 +968,13 @@ static void __task_rq_unlock(struct rq *rq) raw_spin_unlock(&rq->lock); } -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) + __releases(p->pi_lock) { - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } /* @@ -2175,6 +2170,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif #endif trace_sched_migrate_task(p, new_cpu); @@ -2270,7 +2270,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -2652,6 +2652,7 @@ static void __sched_fork(struct task_struct *p) */ void sched_fork(struct task_struct *p, int clone_flags) { + unsigned long flags; int cpu = get_cpu(); __sched_fork(p); @@ -2702,9 +2703,9 @@ void sched_fork(struct task_struct *p, int clone_flags) * * Silence PROVE_RCU. */ - rcu_read_lock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu); - rcu_read_unlock(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2753,7 +2754,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) set_task_cpu(p, cpu); p->state = TASK_RUNNING; - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); #endif rq = task_rq_lock(p, &flags); @@ -2765,7 +2766,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); #endif - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); put_cpu(); } @@ -3490,12 +3491,12 @@ void sched_exec(void) likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } #endif @@ -3532,7 +3533,7 @@ unsigned long long task_delta_exec(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3550,7 +3551,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3574,7 +3575,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); thread_group_cputime(p, &totals); ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -4693,16 +4694,13 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; int oldprio, on_rq, running; struct rq *rq; const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); - lockdep_assert_held(&p->pi_lock); - - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); trace_sched_pi_setprio(p, prio); oldprio = p->prio; @@ -4727,7 +4725,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); } #endif @@ -4775,7 +4773,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_task(rq->curr); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -5003,20 +5001,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* + * * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -5040,8 +5035,7 @@ recheck: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EPERM; } } @@ -5050,8 +5044,7 @@ recheck: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } on_rq = p->on_rq; @@ -5073,8 +5066,7 @@ recheck: activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); rt_mutex_adjust_pi(p); @@ -5666,7 +5658,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, rq = task_rq_lock(p, &flags); time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5889,8 +5881,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - raw_spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5918,15 +5909,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return ret; } @@ -5954,6 +5943,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); + raw_spin_lock(&p->pi_lock); double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) @@ -5976,6 +5966,7 @@ done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); return ret; } @@ -8702,7 +8693,7 @@ void sched_move_task(struct task_struct *tsk) if (on_rq) enqueue_task(rq, tsk, 0); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, tsk, &flags); } #endif /* CONFIG_CGROUP_SCHED */ -- cgit v1.2.3 From ab2515c4b98f7bc4fa11cad9fa0f811d63a72a26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:52 +0200 Subject: sched: Drop rq->lock from first part of wake_up_new_task() Since p->pi_lock now protects all things needed to call select_task_rq() avoid the double remote rq->lock acquisition and rely on p->pi_lock. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.273362517@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f1551271a685..7c5494dccd39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2736,28 +2736,18 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu __maybe_unused = get_cpu(); + raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_SMP - rq = task_rq_lock(p, &flags); - p->state = TASK_WAKING; - /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug - * - * We set TASK_WAKING so that select_task_rq() can drop rq->lock - * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); - set_task_cpu(p, cpu); - - p->state = TASK_RUNNING; - task_rq_unlock(rq, p, &flags); + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = 1; trace_sched_wakeup_new(p, true); @@ -2767,7 +2757,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, p, &flags); - put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v1.2.3 From 8f42ced974df7d5af2de4cf5ea21fe978c7e4478 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:53 +0200 Subject: sched: Drop rq->lock from sched_exec() Since we can now call select_task_rq() and set_task_cpu() with only p->pi_lock held, and sched_exec() load-balancing has always been optimistic, drop all rq->lock usage. Oleg also noted that need_migrate_task() will always be true for current, so don't bother calling that at all. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.314204889@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7c5494dccd39..1be1a09b9dc9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3465,27 +3465,22 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - struct rq *rq; int dest_cpu; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; - /* - * select_task_rq() can race against ->cpus_allowed - */ - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { + if (likely(cpu_active(dest_cpu))) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #endif -- cgit v1.2.3 From e4a52bcb9a18142d79e231b6733cabdbf2e67c1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:54 +0200 Subject: sched: Remove rq->lock from the first half of ttwu() Currently ttwu() does two rq->lock acquisitions, once on the task's old rq, holding it over the p->state fiddling and load-balance pass. Then it drops the old rq->lock to acquire the new rq->lock. By having serialized ttwu(), p->sched_class, p->cpus_allowed with p->pi_lock, we can now drop the whole first rq->lock acquisition. The p->pi_lock serializing concurrent ttwu() calls protects p->state, which we will set to TASK_WAKING to bridge possible p->pi_lock to rq->lock gaps and serialize set_task_cpu() calls against task_rq_lock(). The p->pi_lock serialization of p->sched_class allows us to call scheduling class methods without holding the rq->lock, and the serialization of p->cpus_allowed allows us to do the load-balancing bits without races. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.354401150@chello.nl --- kernel/sched.c | 65 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 28 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1be1a09b9dc9..871dd9e147a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2493,69 +2493,78 @@ ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags) * Returns %true if @p was woken up, %false if it was already running * or @state didn't match @p's state. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, orig_cpu, this_cpu, success = 0; + int cpu, this_cpu, success = 0; unsigned long flags; - unsigned long en_flags = ENQUEUE_WAKEUP; struct rq *rq; this_cpu = get_cpu(); smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_rq_lock(p); if (!(p->state & state)) goto out; cpu = task_cpu(p); - if (p->on_rq) - goto out_running; + if (p->on_rq) { + rq = __task_rq_lock(p); + if (p->on_rq) + goto out_running; + __task_rq_unlock(rq); + } - orig_cpu = cpu; #ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; + while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + /* + * If called from interrupt context we could have landed in the + * middle of schedule(), in this case we should take care not + * to spin on ->on_cpu if p is current, since that would + * deadlock. + */ + if (p == current) + goto out_activate; +#endif + cpu_relax(); + } + /* + * Pairs with the smp_wmb() in finish_lock_switch(). + */ + smp_rmb(); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) { + if (p->sched_class->task_waking) p->sched_class->task_waking(p); - en_flags |= ENQUEUE_WAKING; - } cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) - set_task_cpu(p, cpu); - __task_rq_unlock(rq); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +out_activate: +#endif +#endif /* CONFIG_SMP */ rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); - /* - * We migrated the task without holding either rq->lock, however - * since the task is not on the task list itself, nobody else - * will try and migrate the task, hence the rq should match the - * cpu we just moved it to. - */ - WARN_ON(task_cpu(p) != cpu); - WARN_ON(p->state != TASK_WAKING); +#ifdef CONFIG_SMP + if (cpu != task_cpu(p)) + set_task_cpu(p, cpu); if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#endif -out_activate: -#endif /* CONFIG_SMP */ - ttwu_activate(rq, p, en_flags); + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); out_running: ttwu_post_activation(p, rq, wake_flags); ttwu_stat(rq, p, cpu, wake_flags); success = 1; -out: __task_rq_unlock(rq); +out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); put_cpu(); -- cgit v1.2.3 From b84cb5df1f9ad6da3f214c638d5fb08d0c99de1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:55 +0200 Subject: sched: Remove rq argument from ttwu_stat() In order to call ttwu_stat() without holding rq->lock we must remove its rq argument. Since we need to change rq stats, account to the local rq instead of the task rq, this is safe since we have IRQs disabled. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.394638826@chello.nl --- kernel/sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 871dd9e147a6..5ec2e8b4b01a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2410,9 +2410,11 @@ static void update_avg(u64 *avg, u64 sample) #endif static void -ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { #ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + #ifdef CONFIG_SMP int this_cpu = smp_processor_id(); @@ -2561,9 +2563,10 @@ out_activate: ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); out_running: ttwu_post_activation(p, rq, wake_flags); - ttwu_stat(rq, p, cpu, wake_flags); success = 1; __task_rq_unlock(rq); + + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); put_cpu(); @@ -2600,7 +2603,7 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); - ttwu_stat(rq, p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } -- cgit v1.2.3 From 23f41eeb42ce7f6f1210904e49e84718f02cb61c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:56 +0200 Subject: sched: Rename ttwu_post_activation() to ttwu_do_wakeup() The ttwu_post_activation() code does the core wakeup, it sets TASK_RUNNING and performs wakeup-preemption, so give is a more descriptive name. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.434609705@chello.nl --- kernel/sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5ec2e8b4b01a..e309dbad2038 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2456,8 +2456,11 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) wq_worker_waking_up(p, cpu_of(rq)); } +/* + * Mark the task runnable and perform wakeup-preemption. + */ static void -ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags) +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); @@ -2562,7 +2565,7 @@ out_activate: ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); out_running: - ttwu_post_activation(p, rq, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags); success = 1; __task_rq_unlock(rq); @@ -2602,7 +2605,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_post_activation(p, rq, 0); + ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); -- cgit v1.2.3 From c05fbafba1c5482bee399b360288fa405415e126 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:57 +0200 Subject: sched: Restructure ttwu() some more Factor our helper functions to make the inner workings of try_to_wake_up() more obvious, this also allows for adding remote queues. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.475848012@chello.nl --- kernel/sched.c | 91 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 33 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e309dbad2038..7d8b85fcdf06 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2483,6 +2483,48 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) #endif } +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; +#endif + + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_rq) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +static void ttwu_queue(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + raw_spin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + raw_spin_unlock(&rq->lock); +} + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -2501,27 +2543,25 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; - - this_cpu = get_cpu(); + int cpu, success = 0; smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - if (p->on_rq) { - rq = __task_rq_lock(p); - if (p->on_rq) - goto out_running; - __task_rq_unlock(rq); - } + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; #ifdef CONFIG_SMP + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + */ while (p->on_cpu) { #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW /* @@ -2530,8 +2570,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * to spin on ->on_cpu if p is current, since that would * deadlock. */ - if (p == current) - goto out_activate; + if (p == current) { + ttwu_queue(p, cpu); + goto stat; + } #endif cpu_relax(); } @@ -2547,32 +2589,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -out_activate: -#endif -#endif /* CONFIG_SMP */ - - rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - -#ifdef CONFIG_SMP - if (cpu != task_cpu(p)) + if (task_cpu(p) != cpu) set_task_cpu(p, cpu); +#endif /* CONFIG_SMP */ - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; -#endif - - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); -out_running: - ttwu_do_wakeup(rq, p, wake_flags); - success = 1; - __task_rq_unlock(rq); - + ttwu_queue(p, cpu); +stat: ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); - put_cpu(); return success; } -- cgit v1.2.3 From bd8e7dded88a3e1c085c333f19ff31387616f71a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:59 +0200 Subject: sched: Remove need_migrate_task() Oleg noticed that need_migrate_task() doesn't need the ->on_cpu check now that ttwu() doesn't do remote enqueues for !->on_rq && ->on_cpu, so remove the helper and replace the single instance with a direct ->on_rq test. Suggested-by: Oleg Nesterov Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.556674812@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9e3ede120e81..cd597c7442a3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2198,21 +2198,6 @@ struct migration_arg { static int migration_cpu_stop(void *data); -/* - * The task's runqueue lock must be held. - * Returns true if you have to wait for migration thread. - */ -static bool need_migrate_task(struct task_struct *p) -{ - /* - * If the task is not on a runqueue (and not running), then - * the next wake-up will properly place the task. - */ - bool running = p->on_rq || p->on_cpu; - smp_rmb(); /* finish_lock_switch() */ - return running; -} - /* * wait_task_inactive - wait for a thread to unschedule. * @@ -5985,7 +5970,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (need_migrate_task(p)) { + if (p->on_rq) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); -- cgit v1.2.3 From 317f394160e9beb97d19a84c39b7e5eb3d7815a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:58 +0200 Subject: sched: Move the second half of ttwu() to the remote cpu Now that we've removed the rq->lock requirement from the first part of ttwu() and can compute placement without holding any rq->lock, ensure we execute the second half of ttwu() on the actual cpu we want the task to run on. This avoids having to take rq->lock and doing the task enqueue remotely, saving lots on cacheline transfers. As measured using: http://oss.oracle.com/~mason/sembench.c $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ; do echo performance > $i; done $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 647278 worker burns per second patched: run time 30 seconds 816715 worker burns per second Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.515897185@chello.nl --- kernel/sched.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7d8b85fcdf06..9e3ede120e81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -556,6 +556,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -2516,10 +2520,61 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) return ret; } +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} +#endif + static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + raw_spin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); raw_spin_unlock(&rq->lock); @@ -6331,6 +6386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { -- cgit v1.2.3 From 057f3fadb347e9c51b07e1b277bbdda79f976768 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2011 11:24:34 +0200 Subject: sched: Fix sched_domain iterations vs. RCU Vladis Kletnieks reported a new RCU debug warning in the scheduler. Since commit dce840a08702b ("sched: Dynamically allocate sched_domain/ sched_group data-structures") the sched_domain trees are protected by RCU instead of RCU-sched. This means that we need to include rcu_read_lock() protection when we iterate them since disabling preemption doesn't suffice anymore. Reported-by: Valdis.Kletnieks@vt.edu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302882741.2388.241.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0cfe0310ed5d..27d3e73a2af6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1208,11 +1208,17 @@ int get_nohz_timer_target(void) int i; struct sched_domain *sd; + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) - if (!idle_cpu(i)) - return i; + for_each_cpu(i, sched_domain_span(sd)) { + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } } +unlock: + rcu_read_unlock(); return cpu; } /* @@ -2415,12 +2421,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) struct sched_domain *sd; schedstat_inc(p, se.statistics.nr_wakeups_remote); + rcu_read_lock(); for_each_domain(this_cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } } + rcu_read_unlock(); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From d3bf52e998056a6002b2aecfe1d25486376382ac Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 20 Apr 2011 21:27:32 +0600 Subject: sched: Remove obsolete comment from scheduler_tick() scheduler_tick() is no longer called by fork code - this got discarded a long time ago by commit bc947631d1d532 ("sched: improve efficiency of sched_fork()"). So, remove the comment which still claims otherwise. Signed-off-by: Rakib Mullick Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTimO4iGP0QpaHO1HHF1QOnVcQpc0cw@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 30a29ad46eb1..8cb0a5769a16 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4002,9 +4002,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. */ void scheduler_tick(void) { -- cgit v1.2.3 From 625f2a378e5a10f45fdc37932fc9f8a21676de9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 22 Apr 2011 11:19:10 -0600 Subject: sched: Get rid of lock_depth Neil Brown pointed out that lock_depth somehow escaped the BKL removal work. Let's get rid of it now. Note that the perf scripting utilities still have a bunch of code for dealing with common_lock_depth in tracepoints; I have left that in place in case anybody wants to use that code with older kernels. Suggested-by: Neil Brown Signed-off-by: Jonathan Corbet Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110422111910.456c0e84@bike.lwn.net Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8cb0a5769a16..9cde2dd229c9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4121,12 +4121,6 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq(), sched_count); -#ifdef CONFIG_SCHEDSTATS - if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), rq_sched_info.bkl_count); - schedstat_inc(prev, sched_info.bkl_count); - } -#endif } static void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -5852,11 +5846,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ -- cgit v1.2.3 From 1437f5bca3c2d162f058cba37dfbeb20f619040d Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sat, 23 Apr 2011 21:29:05 +0800 Subject: sched: Remove noop in alloc_rt_sched_group() The rq varible, though computed for each possible cpu, has nothing to do in the function, so it can be removed. This also eliminates a build warning. Signed-off-by: Hillf Danton Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTin-FfQfqW5ym1iuEmrk8s777Y1LAg@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9cde2dd229c9..f11a2a5d03aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8226,7 +8226,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; struct sched_rt_entity *rt_se; - struct rq *rq; int i; tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8240,8 +8239,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { - rq = cpu_rq(i); - rt_rq = kzalloc_node(sizeof(struct rt_rq), GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) -- cgit v1.2.3 From 4934a4d3d3fa775601a9f1b35cc0e2aa93f81355 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 4 May 2011 22:53:46 +0600 Subject: sched: Wrap the 'cfs_rq->nr_spread_over' field with CONFIG_SCHED_DEBUG cfs_rq->nr_spread_over is only used when CONFIG_SCHED_DEBUG is set. So wrap it with CONFIG_SCHED_DEBUG. Signed-off-by: Rakib Mullick Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1304528026.15681.3.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f11a2a5d03aa..3d8a1b2680ee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -328,7 +328,9 @@ struct cfs_rq { */ struct sched_entity *curr, *next, *last, *skip; +#ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ -- cgit v1.2.3 From 7142d17e8f935fa842e9f6eece2281b6d41625d6 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Thu, 5 May 2011 20:53:20 +0800 Subject: sched: Shorten the construction of the span cpu mask of sched domain For a given node, when constructing the cpumask for its sched_domain to span, if there is no best node available after searching, further efforts could be saved, based on small change in the return value of find_next_best_node(). Signed-off-by: Hillf Danton Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Yong Zhang Link: http://lkml.kernel.org/r/BANLkTi%3DqPWxRAa6%2BdT3ohEP6Z%3D0v%2Be4EXA@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3d8a1b2680ee..da9338150484 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6807,7 +6807,7 @@ __setup("isolcpus=", isolated_cpu_setup); */ static int find_next_best_node(int node, nodemask_t *used_nodes) { - int i, n, val, min_val, best_node = 0; + int i, n, val, min_val, best_node = -1; min_val = INT_MAX; @@ -6831,7 +6831,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) } } - node_set(best_node, *used_nodes); + if (best_node != -1) + node_set(best_node, *used_nodes); return best_node; } @@ -6857,7 +6858,8 @@ static void sched_domain_node_span(int node, struct cpumask *span) for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - + if (next_node < 0) + break; cpumask_or(span, span, cpumask_of_node(next_node)); } } -- cgit v1.2.3 From 3e51e3edfd81bfd9853ad7de91167e4ce33d0fe7 Mon Sep 17 00:00:00 2001 From: Samir Bellabes Date: Wed, 11 May 2011 18:18:05 +0200 Subject: sched: Remove unused parameters from sched_fork() and wake_up_new_task() sched_fork() and wake_up_new_task() are defined with a parameter 'unsigned long clone_flags', which is unused. This patch removes the parameters. Signed-off-by: Samir Bellabes Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305130685-1047-1-git-send-email-sam@synack.fr Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index da9338150484..f9778c0d91e2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2741,7 +2741,7 @@ static void __sched_fork(struct task_struct *p) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p, int clone_flags) +void sched_fork(struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); @@ -2823,7 +2823,7 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p) { unsigned long flags; struct rq *rq; -- cgit v1.2.3 From 61eadef6a9bde9ea62fda724a9cb501ce9bc925a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 29 Apr 2011 08:36:50 +0200 Subject: sched, rt: Update rq clock when unthrottling of an otherwise idle CPU If an RT task is awakened while it's rt_rq is throttled, the time between wakeup/enqueue and unthrottle/selection may be accounted as rt_time if the CPU is idle. Set rq->skip_clock_update negative upon throttle release to tell put_prev_task() that we need a clock update. Reported-by: Thomas Giesel Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1304059010.7472.1.camel@marge.simson.net Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f9778c0d91e2..b8b9a7dac9b0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -466,7 +466,7 @@ struct rq { u64 nohz_stamp; unsigned char nohz_balance_kick; #endif - unsigned int skip_clock_update; + int skip_clock_update; /* capture load from *all* tasks on this cpu: */ struct load_weight load; @@ -652,7 +652,7 @@ static void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update) + if (rq->skip_clock_update > 0) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; @@ -4127,7 +4127,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->on_rq) + if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } -- cgit v1.2.3 From db44fc017d5989302713ab4e7f9e922b648f4b59 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 9 May 2011 22:07:05 +0800 Subject: sched: Avoid going ahead if ->cpus_allowed is not changed If cpumask_equal(&p->cpus_allowed, new_mask) is true, seems there is no reason to prevent set_cpus_allowed_ptr() return directly. Signed-off-by: Yong Zhang Acked-by: Hillf Danton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110509140705.GA2219@zhy Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b8b9a7dac9b0..70bec4f1edbb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5946,13 +5946,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpumask_equal(&p->cpus_allowed, new_mask))) { + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From db670dac49b5423b39b5e523d28fe32045d71b10 Mon Sep 17 00:00:00 2001 From: Stephan Baerwolf Date: Wed, 11 May 2011 18:03:29 +0200 Subject: sched: Fix and optimise calculation of the weight-inverse If the inverse loadweight should be zero, function "calc_delta_mine" calculates the inverse of "lw->weight" (in 32bit integer ops). This calculation is actually a little bit impure (because it is inverting something around "lw-weight"+1), especially when "lw->weight" becomes smaller. The correct inverse would be 1/lw->weight multiplied by "WMULT_CONST" for fixcomma-scaling it into integers. (So WMULT_CONST/lw->weight ...) The old, impure algorithm took two divisions for inverting lw->weight, the new, more exact one only takes one and an additional unlikely-if. Signed-off-by: Stephan Baerwolf Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-0pz0wnyalr4tk4ln11xwumdx@git.kernel.org [ This could explain some aritmetical issues for small shares but nothing concrete has been reported yet so we are not confident enough to queue this up in sched/urgent and for -stable backport. But if anyone finds this commit and sees it to fix some badness then we can certainly change our mind! ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 70bec4f1edbb..c62acf45d3b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1330,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; + tmp = (u64)delta_exec * weight; + if (!lw->inv_weight) { if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) lw->inv_weight = 1; else - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) - / (lw->weight+1); + lw->inv_weight = WMULT_CONST / lw->weight; } - tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ -- cgit v1.2.3 From f05998d4b80632f2cc00f108da503066ef5d38d5 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 10:09:38 -0700 Subject: sched: Cleanup set_load_weight() Avoid using long repetitious names; make this simpler and nicer to read. No functional change introduced in this patch. Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1305738580-9924-2-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c62acf45d3b9..d036048b59a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1778,17 +1778,20 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + /* * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; + load->weight = WEIGHT_IDLEPRIO; + load->inv_weight = WMULT_IDLEPRIO; return; } - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; + load->weight = prio_to_weight[prio]; + load->inv_weight = prio_to_wmult[prio]; } static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -- cgit v1.2.3 From 1399fa7807a1a5998bbf147e80668e9950661dfa Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 10:09:39 -0700 Subject: sched: Introduce SCHED_POWER_SCALE to scale cpu_power calculations SCHED_LOAD_SCALE is used to increase nice resolution and to scale cpu_power calculations in the scheduler. This patch introduces SCHED_POWER_SCALE and converts all uses of SCHED_LOAD_SCALE for scaling cpu_power to use SCHED_POWER_SCALE instead. This is a preparatory patch for increasing the resolution of SCHED_LOAD_SCALE, and there is no need to increase resolution for cpu_power calculations. Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1305738580-9924-3-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d036048b59a4..375e9c677d58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6530,7 +6530,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_LOAD_SCALE) { + if (group->cpu_power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", group->cpu_power); } @@ -7905,7 +7905,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_LOAD_SCALE; + rq->cpu_power = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; -- cgit v1.2.3 From c8b281161dfa4bb5d5be63fb036ce19347b88c63 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 14:37:48 -0700 Subject: sched: Increase SCHED_LOAD_SCALE resolution Introduce SCHED_LOAD_RESOLUTION, which scales is added to SCHED_LOAD_SHIFT and increases the resolution of SCHED_LOAD_SCALE. This patch sets the value of SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all sched entities by a factor of 1024. With this extra resolution, we can handle deeper cgroup hiearchies and the scheduler can do better shares distribution and load load balancing on larger systems (especially for low weight task groups). This does not change the existing user interface, the scaled weights are only used internally. We do not modify prio_to_weight values or inverses, but use the original weights when calculating the inverse which is used to scale execution time delta in calc_delta_mine(). This ensures we do not lose accuracy when accounting time to the sched entities. Thanks to Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness. Below is some analysis of the performance costs/improvements of this patch. 1. Micro-arch performance costs: Experiment was to run Ingo's pipe_test_100k 200 times with the task pinned to one cpu. I measured instruction, cycles and stalled-cycles for the runs. See: http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389 for more info. -tip (baseline): Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs): 964,991,769 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.05% ) 1,171,186,635 cycles # 0.000 GHz ( +- 0.08% ) 306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% ) 314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% ) 1.122405684 seconds time elapsed ( +- 0.05% ) -tip+patches: Performance counter stats for './load-scale/pipe-test-100k' (200 runs): 963,624,821 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.04% ) 1,175,215,649 cycles # 0.000 GHz ( +- 0.08% ) 315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% ) 316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% ) 1.122238659 seconds time elapsed ( +- 0.06% ) With this patch, instructions decrease by ~0.10% and cycles increase by 0.27%. This doesn't look statistically significant. The number of stalled cycles in the backend increased from 26.16% to 26.83%. This can be attributed to the shifts we do in c_d_m() and other places. The fraction of stalled cycles in the frontend remains about the same, at 26.96% compared to 26.89% in -tip. 2. Balancing low-weight task groups Test setup: run 50 tasks with random sleep/busy times (biased around 100ms) in a low weight container (with cpu.shares = 2). Measure %idle as reported by mpstat over a 10s window. -tip (baseline): 06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00 06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00 06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00 06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00 06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00 06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00 06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00 06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00 06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00 06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00 Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60 -tip+patches: 06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00 06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20 06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00 06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61 06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00 06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00 06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00 06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00 06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00 06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00 Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58 We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 375e9c677d58..bb504e1839e5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock); * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) +#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -1330,13 +1330,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - tmp = (u64)delta_exec * weight; + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; else - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = WMULT_CONST / w; } /* @@ -1785,12 +1797,12 @@ static void set_load_weight(struct task_struct *p) * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - load->weight = WEIGHT_IDLEPRIO; + load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; } - load->weight = prio_to_weight[prio]; + load->weight = scale_load(prio_to_weight[prio]); load->inv_weight = prio_to_wmult[prio]; } @@ -8809,14 +8821,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), shareval); + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); } static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); - return (u64) tg->shares; + return (u64) scale_load_down(tg->shares); } #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 17d9f311eca13a42bf950198a358be1420d19c5f Mon Sep 17 00:00:00 2001 From: Daniel Hellstrom Date: Fri, 20 May 2011 04:01:10 +0000 Subject: SCHED_TTWU_QUEUE is not longer needed since sparc32 now implements IPI Signed-off-by: Daniel Hellstrom Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: David S. Miller --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8c9d804dc07d..c4b3410d68d3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2568,7 +2568,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); -#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) +#if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { ttwu_queue_remote(p, cpu); return; -- cgit v1.2.3 From f780bdb7c1c73009cb57adcf99ef50027d80bf3c Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Thu, 26 May 2011 16:25:19 -0700 Subject: cgroups: add per-thread subsystem callbacks Add cgroup subsystem callbacks for per-thread attachment in atomic contexts Add can_attach_task(), pre_attach(), and attach_task() as new callbacks for cgroups's subsystem interface. Unlike can_attach and attach, these are for per-thread operations, to be called potentially many times when attaching an entire threadgroup. Also, the old "bool threadgroup" interface is removed, as replaced by this. All subsystems are modified for the new interface - of note is cpuset, which requires from/to nodemasks for attach to be globally scoped (though per-cpuset would work too) to persist from its pre_attach to attach_task and attach. This is a pre-patch for cgroup-procs-writable.patch. Signed-off-by: Ben Blum Cc: "Eric W. Biederman" Cc: Li Zefan Cc: Matt Helsley Reviewed-by: Paul Menage Cc: Oleg Nesterov Cc: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2d12893b8b0f..5e43e9dc65d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8764,42 +8764,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) return 0; } -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup) -{ - int retval = cpu_cgroup_can_attach_task(cgrp, tsk); - if (retval) - return retval; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - retval = cpu_cgroup_can_attach_task(cgrp, c); - if (retval) { - rcu_read_unlock(); - return retval; - } - } - rcu_read_unlock(); - } - return 0; -} - static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk, - bool threadgroup) +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { sched_move_task(tsk); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - sched_move_task(c); - } - rcu_read_unlock(); - } } static void @@ -8887,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, + .can_attach_task = cpu_cgroup_can_attach_task, + .attach_task = cpu_cgroup_attach_task, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, -- cgit v1.2.3 From d6aa8f85f16379d42c147b22b59e33b67f9ff466 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 May 2011 14:21:33 +0200 Subject: sched: Fix ttwu() for __ARCH_WANT_INTERRUPTS_ON_CTXSW Marc reported that e4a52bcb9 (sched: Remove rq->lock from the first half of ttwu()) broke his ARM-SMP machine. Now ARM is one of the few __ARCH_WANT_INTERRUPTS_ON_CTXSW users, so that exception in the ttwu() code was suspect. Yong found that the interrupt could hit after context_switch() changes current but before it clears p->on_cpu, if that interrupt were to attempt a wake-up of p we would indeed find ourselves spinning in IRQ context. Fix this by reverting to the old behaviour for this situation and perform a full remote wake-up. Cc: Frank Rowand Cc: Yong Zhang Cc: Oleg Nesterov Reported-by: Marc Zyngier Tested-by: Marc Zyngier Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5e43e9dc65d1..a80ee911900e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2573,7 +2573,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) if (!next) smp_send_reschedule(cpu); } -#endif + +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +static int ttwu_activate_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_cpu) { + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; + +} +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +#endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu) { @@ -2631,17 +2650,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) while (p->on_cpu) { #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW /* - * If called from interrupt context we could have landed in the - * middle of schedule(), in this case we should take care not - * to spin on ->on_cpu if p is current, since that would - * deadlock. + * In case the architecture enables interrupts in + * context_switch(), we cannot busy wait, since that + * would lead to deadlocks when an interrupt hits and + * tries to wake up @prev. So bail and do a complete + * remote wakeup. */ - if (p == current) { - ttwu_queue(p, cpu); + if (ttwu_activate_remote(p, wake_flags)) goto stat; - } -#endif +#else cpu_relax(); +#endif } /* * Pairs with the smp_wmb() in finish_lock_switch(). -- cgit v1.2.3 From 1e1b6c511d1b23cb7c3b619d82fc7bd9f620565d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 19 May 2011 15:08:58 +0900 Subject: cpuset: Fix cpuset_cpus_allowed_fallback(), don't update tsk->rt.nr_cpus_allowed The rule is, we have to update tsk->rt.nr_cpus_allowed if we change tsk->cpus_allowed. Otherwise RT scheduler may confuse. Signed-off-by: KOSAKI Motohiro Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4DD4B3FA.5060901@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a80ee911900e..cbb3a0eee58e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5860,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + do_set_cpus_allowed(idle, cpumask_of(cpu)); /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -5948,6 +5948,16 @@ static inline void sched_init_granularity(void) } #ifdef CONFIG_SMP +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + else { + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + } +} + /* * This is how migration works: * @@ -5993,12 +6003,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } + do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) -- cgit v1.2.3 From f01114cb59d670e9b4f2c335930dd57db96e9360 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 12:26:55 +0200 Subject: sched: Fix cross-cpu clock sync on remote wakeups Markus reported that commit 317f394160e ("sched: Move the second half of ttwu() to the remote cpu") caused some accounting funnies on his AMD Phenom II X4, such as weird 'top' results. It turns out that this is due to non-synced TSC and the queued remote wakeups stopped coupeling the two relevant cpu clocks, which leads to wakeups seeing time jumps, which in turn lead to skewed runtime stats. Add an explicit call to sched_clock_cpu() to couple the per-cpu clocks to restore the normal flow of time. Reported-and-tested-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306835745.2353.3.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cbb3a0eee58e..49cc70b152cf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2600,6 +2600,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; } -- cgit v1.2.3 From f339b9dc1f03591761d5d930800db24bc0eda1e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 10:49:20 +0200 Subject: sched: Fix schedstat.nr_wakeups_migrate While looking over the code I found that with the ttwu rework the nr_wakeups_migrate test broke since we now switch cpus prior to calling ttwu_stat(), hence the test is always true. Cure this by passing the migration state in wake_flags. Also move the whole test under CONFIG_SMP, its hard to migrate tasks on UP :-) Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-pwwxl7gdqs5676f1d4cx6pj7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 49cc70b152cf..2fe98ed474da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2447,6 +2447,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } rcu_read_unlock(); } + + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + #endif /* CONFIG_SMP */ schedstat_inc(rq, ttwu_count); @@ -2455,9 +2459,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (cpu != task_cpu(p)) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SCHEDSTATS */ } @@ -2675,8 +2676,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) + if (task_cpu(p) != cpu) { + wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); + } #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); -- cgit v1.2.3 From 6c6c54e1807faf116724451ef2bd14993780470a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jun 2011 17:37:07 +0200 Subject: sched: Fix/clarify set_task_cpu() locking rules Sergey reported a CONFIG_PROVE_RCU warning in push_rt_task where set_task_cpu() was called with both relevant rq->locks held, which should be sufficient for running tasks since holding its rq->lock will serialize against sched_move_task(). Update the comments and fix the task_group() lockdep test. Reported-and-tested-by: Sergey Senozhatsky Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307115427.2353.3456.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2fe98ed474da..3f2e502d609b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq) /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() - * holds that lock for each task it moves into the cgroup. Therefore - * by holding that lock, we pin the task to the current cgroup. + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. */ static inline struct task_group *task_group(struct task_struct *p) { @@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock)); + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see set_task_rq(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif -- cgit v1.2.3