summaryrefslogtreecommitdiff
path: root/include/linux/sched.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r--include/linux/sched.h340
1 files changed, 309 insertions, 31 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e27baeeda3f4..485234d2fd42 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,12 +16,14 @@ struct sched_param {
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
+#include <linux/plist.h>
#include <linux/rbtree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/nodemask.h>
#include <linux/mm_types.h>
+#include <linux/preempt_mask.h>
#include <asm/page.h>
#include <asm/ptrace.h>
@@ -55,6 +57,70 @@ struct sched_param {
#include <asm/processor.h>
+#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param can not be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant at describing a so-called
+ * sporadic time-constrained task. In such model a task is specified by:
+ * - the activation period or minimum instance inter-arrival time;
+ * - the maximum (or average, depending on the actual scheduling
+ * discipline) computation time of all instances, a.k.a. runtime;
+ * - the deadline (relative to the actual activation time) of each
+ * instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ * @size size of the structure, for fwd/bwd compat.
+ *
+ * @sched_policy task's scheduling policy
+ * @sched_flags for customizing the scheduler behaviour
+ * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
+ * @sched_priority task's static priority (SCHED_FIFO/RR)
+ * @sched_deadline representative of the task's deadline
+ * @sched_runtime representative of the task's runtime
+ * @sched_period representative of the task's period
+ *
+ * Given this task model, there are a multiplicity of scheduling algorithms
+ * and policies, that can be used to ensure all the tasks will make their
+ * timing constraints.
+ *
+ * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
+ * only user of this new interface. More information about the algorithm
+ * available in the scheduling class file or in Documentation/.
+ */
+struct sched_attr {
+ u32 size;
+
+ u32 sched_policy;
+ u64 sched_flags;
+
+ /* SCHED_NORMAL, SCHED_BATCH */
+ s32 sched_nice;
+
+ /* SCHED_FIFO, SCHED_RR */
+ u32 sched_priority;
+
+ /* SCHED_DEADLINE */
+ u64 sched_runtime;
+ u64 sched_deadline;
+ u64 sched_period;
+};
+
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
@@ -167,7 +233,6 @@ extern char ___assert_task_state[1 - 2*!!(
#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
-#define task_is_dead(task) ((task)->exit_state != 0)
#define task_is_stopped_or_traced(task) \
((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) \
@@ -285,6 +350,14 @@ static inline void lockup_detector_init(void)
}
#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+void reset_hung_task_detector(void);
+#else
+static inline void reset_hung_task_detector(void)
+{
+}
+#endif
+
/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -322,6 +395,10 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
extern void set_dumpable(struct mm_struct *mm, int value);
extern int get_dumpable(struct mm_struct *mm);
+#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
+#define SUID_DUMP_USER 1 /* Dump as user of process */
+#define SUID_DUMP_ROOT 2 /* Dump as root */
+
/* mm flags */
/* dumpable bits */
#define MMF_DUMPABLE 0 /* core dump is permitted */
@@ -427,6 +504,12 @@ struct task_cputime {
.sum_exec_runtime = 0, \
}
+#ifdef CONFIG_PREEMPT_COUNT
+#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
+#else
+#define PREEMPT_DISABLED PREEMPT_ENABLED
+#endif
+
/*
* Disable preemption until the scheduler is running.
* Reset by start_kernel()->sched_init()->init_idle().
@@ -434,7 +517,7 @@ struct task_cputime {
* We include PREEMPT_ACTIVE to avoid cond_resched() from working
* before the scheduler is active -- see should_resched().
*/
-#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
+#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
/**
* struct thread_group_cputimer - thread group interval timer counts
@@ -466,6 +549,7 @@ struct signal_struct {
atomic_t sigcnt;
atomic_t live;
int nr_threads;
+ struct list_head thread_head;
wait_queue_head_t wait_chldexit; /* for wait4() */
@@ -768,6 +852,7 @@ enum cpu_idle_type {
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
+#define SD_NUMA 0x4000 /* cross-node balancing */
extern int __weak arch_sd_sibiling_asym_packing(void);
@@ -809,7 +894,9 @@ struct sched_domain {
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
- u64 last_update;
+ /* idle_balance() stats */
+ u64 max_newidle_lb_cost;
+ unsigned long next_decay_max_lb_cost;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
@@ -908,7 +995,8 @@ struct pipe_inode_info;
struct uts_namespace;
struct load_weight {
- unsigned long weight, inv_weight;
+ unsigned long weight;
+ u32 inv_weight;
};
struct sched_avg {
@@ -1006,6 +1094,51 @@ struct sched_rt_entity {
#endif
};
+struct sched_dl_entity {
+ struct rb_node rb_node;
+
+ /*
+ * Original scheduling parameters. Copied here from sched_attr
+ * during sched_setscheduler2(), they will remain the same until
+ * the next sched_setscheduler2().
+ */
+ u64 dl_runtime; /* maximum runtime for each instance */
+ u64 dl_deadline; /* relative deadline of each instance */
+ u64 dl_period; /* separation of two instances (period) */
+ u64 dl_bw; /* dl_runtime / dl_deadline */
+
+ /*
+ * Actual scheduling parameters. Initialized with the values above,
+ * they are continously updated during task execution. Note that
+ * the remaining runtime could be < 0 in case we are in overrun.
+ */
+ s64 runtime; /* remaining runtime for this instance */
+ u64 deadline; /* absolute deadline for this instance */
+ unsigned int flags; /* specifying the scheduler behaviour */
+
+ /*
+ * Some bool flags:
+ *
+ * @dl_throttled tells if we exhausted the runtime. If so, the
+ * task has to wait for a replenishment to be performed at the
+ * next firing of dl_timer.
+ *
+ * @dl_new tells if a new instance arrived. If so we must
+ * start executing it with full runtime and reset its absolute
+ * deadline;
+ *
+ * @dl_boosted tells if we are boosted due to DI. If so we are
+ * outside bandwidth enforcement mechanism (but only until we
+ * exit the critical section).
+ */
+ int dl_throttled, dl_new, dl_boosted;
+
+ /*
+ * Bandwidth enforcement timer. Each -deadline task has its
+ * own bandwidth to be enforced, thus we need one timer per task.
+ */
+ struct hrtimer dl_timer;
+};
struct rcu_node;
@@ -1029,6 +1162,8 @@ struct task_struct {
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts;
+
+ int wake_cpu;
#endif
int on_rq;
@@ -1040,21 +1175,13 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
+ struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
- /*
- * fpu_counter contains the number of consecutive context switches
- * that the FPU is used. If this is over a threshold, the lazy fpu
- * saving becomes unlazy to save the trap. This is an unsigned char
- * so that after 256 times the counter wraps and the behavior turns
- * lazy again; this to deal with bursty apps that only use FPU for
- * a short time
- */
- unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
@@ -1082,6 +1209,7 @@ struct task_struct {
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
+ struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm, *active_mm;
@@ -1144,6 +1272,7 @@ struct task_struct {
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
+ struct list_head thread_node;
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
@@ -1233,9 +1362,12 @@ struct task_struct {
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
- struct plist_head pi_waiters;
+ struct rb_root pi_waiters;
+ struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
+ /* Top pi_waiters task */
+ struct task_struct *pi_top_task;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@ -1324,10 +1456,41 @@ struct task_struct {
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
- int numa_migrate_seq;
unsigned int numa_scan_period;
+ unsigned int numa_scan_period_max;
+ int numa_preferred_nid;
+ int numa_migrate_deferred;
+ unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
+
+ struct list_head numa_entry;
+ struct numa_group *numa_group;
+
+ /*
+ * Exponential decaying average of faults on a per-node basis.
+ * Scheduling placement decisions are made based on the these counts.
+ * The values remain static for the duration of a PTE scan
+ */
+ unsigned long *numa_faults;
+ unsigned long total_numa_faults;
+
+ /*
+ * numa_faults_buffer records faults per node during the current
+ * scan window. When the scan completes, the counts in numa_faults
+ * decay and these values are copied.
+ */
+ unsigned long *numa_faults_buffer;
+
+ /*
+ * numa_faults_locality tracks if faults recorded during the last
+ * scan window were remote/local. The task scan period is adapted
+ * based on the locality of the faults with different weights
+ * depending on whether they were shared or private faults
+ */
+ unsigned long numa_faults_locality[2];
+
+ unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
struct rcu_head rcu;
@@ -1412,16 +1575,33 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#define TNF_MIGRATED 0x01
+#define TNF_NO_GROUP 0x02
+#define TNF_SHARED 0x04
+#define TNF_FAULT_LOCAL 0x08
+
#ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages, bool migrated);
+extern void task_numa_fault(int last_node, int node, int pages, int flags);
+extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
+extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
#else
-static inline void task_numa_fault(int node, int pages, bool migrated)
+static inline void task_numa_fault(int last_node, int node, int pages,
+ int flags)
{
}
+static inline pid_t task_numa_group_id(struct task_struct *p)
+{
+ return 0;
+}
static inline void set_numabalancing_state(bool enabled)
{
}
+static inline void task_numa_free(struct task_struct *p)
+{
+}
#endif
static inline struct pid *task_pid(struct task_struct *task)
@@ -1816,7 +1996,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
* but then during bootup it turns out that sched_clock()
* is reliable after all:
*/
-extern int sched_clock_stable;
+extern int sched_clock_stable(void);
+extern void set_sched_clock_stable(void);
+extern void clear_sched_clock_stable(void);
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
@@ -1895,6 +2077,8 @@ extern int sched_setscheduler(struct task_struct *, int,
const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
const struct sched_param *);
+extern int sched_setattr(struct task_struct *,
+ const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
@@ -1974,7 +2158,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
-extern void sched_fork(struct task_struct *p);
+extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_dead(struct task_struct *p);
extern void proc_caches_init(void);
@@ -2159,6 +2343,16 @@ extern bool current_is_single_threaded(void);
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
+#define __for_each_thread(signal, t) \
+ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+
+#define for_each_thread(p, t) \
+ __for_each_thread((p)->signal, t)
+
+/* Careful: this is a double loop, 'break' won't work as expected. */
+#define for_each_process_thread(p, t) \
+ for_each_process(p) for_each_thread(p, t)
+
static inline int get_nr_threads(struct task_struct *tsk)
{
return tsk->signal->nr_threads;
@@ -2401,11 +2595,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}
-static inline int need_resched(void)
-{
- return unlikely(test_thread_flag(TIF_NEED_RESCHED));
-}
-
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -2474,36 +2663,120 @@ static inline int tsk_is_polling(struct task_struct *p)
{
return task_thread_info(p)->status & TS_POLLING;
}
-static inline void current_set_polling(void)
+static inline void __current_set_polling(void)
{
current_thread_info()->status |= TS_POLLING;
}
-static inline void current_clr_polling(void)
+static inline bool __must_check current_set_polling_and_test(void)
+{
+ __current_set_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ */
+ smp_mb();
+
+ return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
{
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+}
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ */
+ smp_mb();
+
+ return unlikely(tif_need_resched());
}
#elif defined(TIF_POLLING_NRFLAG)
static inline int tsk_is_polling(struct task_struct *p)
{
return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
}
-static inline void current_set_polling(void)
+
+static inline void __current_set_polling(void)
{
set_thread_flag(TIF_POLLING_NRFLAG);
}
-static inline void current_clr_polling(void)
+static inline bool __must_check current_set_polling_and_test(void)
+{
+ __current_set_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ *
+ * XXX: assumes set/clear bit are identical barrier wise.
+ */
+ smp_mb__after_clear_bit();
+
+ return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
{
clear_thread_flag(TIF_POLLING_NRFLAG);
}
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ */
+ smp_mb__after_clear_bit();
+
+ return unlikely(tif_need_resched());
+}
+
#else
static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void current_set_polling(void) { }
-static inline void current_clr_polling(void) { }
+static inline void __current_set_polling(void) { }
+static inline void __current_clr_polling(void) { }
+
+static inline bool __must_check current_set_polling_and_test(void)
+{
+ return unlikely(tif_need_resched());
+}
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+ return unlikely(tif_need_resched());
+}
#endif
+static inline void current_clr_polling(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+ * Once the bit is cleared, we'll get IPIs with every new
+ * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+ * fold.
+ */
+ smp_mb(); /* paired with resched_task() */
+
+ preempt_fold_need_resched();
+}
+
+static __always_inline bool need_resched(void)
+{
+ return unlikely(tif_need_resched());
+}
+
/*
* Thread group CPU time accounting.
*/
@@ -2545,6 +2818,11 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return task_thread_info(p)->cpu;
}
+static inline int task_node(const struct task_struct *p)
+{
+ return cpu_to_node(task_cpu(p));
+}
+
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else