summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c36
-rw-r--r--kernel/sched/fair.c90
-rw-r--r--kernel/sched/sched.h9
3 files changed, 116 insertions, 19 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6b3fff6a6437..82cec9a666e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1162,7 +1162,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
@@ -1183,7 +1184,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
@@ -7355,11 +7355,22 @@ static int cpuset_cpu_inactive(unsigned int cpu)
return 0;
}
+#ifdef CONFIG_SCHED_SMT
+atomic_t sched_smt_present = ATOMIC_INIT(0);
+#endif
+
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going up, increment the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ atomic_inc(&sched_smt_present);
+#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -7408,6 +7419,14 @@ int sched_cpu_deactivate(unsigned int cpu)
else
synchronize_rcu();
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going down, decrement the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ atomic_dec(&sched_smt_present);
+#endif
+
if (!sched_smp_initialized)
return 0;
@@ -8455,10 +8474,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
#endif
/*
* Serialize against wake_up_new_task() such that if its
@@ -8493,6 +8508,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
@@ -8592,8 +8609,10 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
period = ktime_to_ns(tg->cfs_bandwidth.period);
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
- else
+ else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+ else
+ return -EINVAL;
return tg_set_cfs_bandwidth(tg, period, quota);
}
@@ -8615,6 +8634,9 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
u64 quota, period;
+ if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c91d72f3e8f..5e65c7eea872 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1925,6 +1925,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
} else {
delta = p->se.avg.load_sum / p->se.load.weight;
*period = LOAD_AVG_MAX;
@@ -2253,13 +2257,23 @@ no_join:
return;
}
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults;
+ unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
+ if (!numa_faults)
+ return;
+
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2272,8 +2286,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- kfree(numa_faults);
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
}
/*
@@ -3862,6 +3882,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4057,6 +4079,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
@@ -4347,12 +4372,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
int overrun;
int idle = 0;
+ int count = 0;
raw_spin_lock(&cfs_b->lock);
for (;;) {
@@ -4360,6 +4388,36 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
if (idle)
@@ -6634,10 +6692,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -6647,7 +6705,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
@@ -7879,9 +7937,10 @@ more_balance:
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
@@ -7899,13 +7958,22 @@ out_all_pinned:
sd->nr_balance_failed = 0;
out_one_pinned:
+ ld_moved = 0;
+
+ /*
+ * idle_balance() disregards balance intervals, so we could repeatedly
+ * reach this code, which would lead to balance_interval skyrocketting
+ * in a short amount of time. Skip the balance_interval increase logic
+ * to avoid that.
+ */
+ if (env.idle == CPU_NEWLY_IDLE)
+ goto out;
+
/* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
-
- ld_moved = 0;
out:
return ld_moved;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec6e838e991a..819bd5fb0264 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
+#include <linux/sched/smt.h>
#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
#include <linux/kernel_stat.h>
@@ -72,7 +73,13 @@ static inline void update_idle_core(struct rq *rq) { }
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)