From 20ebcdda78a282d1d5266887ddf8a2d670182576 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 17 Jun 2009 16:27:16 -0700 Subject: memcg: remove unneeded forward declaration from sched.h This forward declaration seems pointless. Signed-off-by: Li Zefan Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 02042e7f2196..d0342101756a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -92,7 +92,6 @@ struct sched_param { #include -struct mem_cgroup; struct exec_domain; struct futex_pi_state; struct robust_list_head; -- cgit v1.2.3 From 17f98dcf6010a1cfd25d179fd0ce77d3dc2685c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2009 16:27:51 -0700 Subject: pids: clean up find_task_by_pid variants find_task_by_pid_type_ns is only used to implement find_task_by_vpid and find_task_by_pid_ns, but both of them pass PIDTYPE_PID as first argument. So just fold find_task_by_pid_type_ns into find_task_by_pid_ns and use find_task_by_pid_ns to implement find_task_by_vpid. While we're at it also remove the exports for find_task_by_pid_ns and find_task_by_vpid - we don't have any modular callers left as the only modular caller of he old pre pid namespace find_task_by_pid (gfs2) was switched to pid_task which operates on a struct pid pointer instead of a pid_t. Given the confusion about pid_t values vs namespace that's generally the better option anyway and I think we're better of restricting modules to do it that way. Signed-off-by: Christoph Hellwig Cc: Pavel Emelyanov Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index d0342101756a..4d0754269884 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1878,9 +1878,6 @@ extern struct pid_namespace init_pid_ns; /* * find a task by one of its numerical ids * - * find_task_by_pid_type_ns(): - * it is the most generic call - it finds a task by all id, - * type and namespace specified * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): @@ -1889,9 +1886,6 @@ extern struct pid_namespace init_pid_ns; * see also find_vpid() etc in include/linux/pid.h */ -extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, - struct pid_namespace *ns); - extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -- cgit v1.2.3 From 341c87bf346f57748230628c5ad6ee69219250e8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 30 Jun 2009 11:41:23 -0700 Subject: elf: limit max map count to safe value With ELF, at generating coredump, some more headers other than used vmas are added. When max_map_count == 65536, a core generated by following kinds of code can be unreadable because the number of ELF's program header is written in 16bit in Ehdr (please see elf.h) and the number overflows. == ... = mmap(); (munmap, mprotect, etc...) if (failed) abort(); == This can happen in mmap/munmap/mprotect/etc...which calls split_vma(). I think 65536 is not safe as _default_ and reduce it to 65530 is good for avoiding unexpected corrupted core. Anyway, max_map_count can be enlarged by sysctl if a user is brave.. Signed-off-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Jakub Jelinek Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d0754269884..0085d758d645 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -349,8 +349,20 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); struct nsproxy; struct user_namespace; -/* Maximum number of active map areas.. This is a random (large) number */ -#define DEFAULT_MAX_MAP_COUNT 65536 +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHORT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; -- cgit v1.2.3 From c99e6efe1ba04561e7d93a81f0be07e37427e835 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:56 +0200 Subject: sched: INIT_PREEMPT_COUNT Pull the initial preempt_count value into a single definition site. Maintainers for: alpha, ia64 and m68k, please have a look, your arch code is funny. The header magic is a bit odd, but similar to the KERNEL_DS one, CPP waits with expanding these macros until the INIT_THREAD_INFO macro itself is expanded, which is in arch/*/kernel/init_task.c where we've already included sched.h so we're good. Cc: tony.luck@intel.com Cc: rth@twiddle.net Cc: geert@linux-m68k.org Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0085d758d645..2a99f1c15cf8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -498,6 +498,12 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * Disable preemption until the scheduler is running. + * Reset by start_kernel()->sched_init()->init_idle(). + */ +#define INIT_PREEMPT_COUNT (1) + /** * struct thread_group_cputimer - thread group interval timer counts * @cputime: thread group interval timers. -- cgit v1.2.3 From d86ee4809d0329d4aa0d0f2c76c2295a16862799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:57 +0200 Subject: sched: optimize cond_resched() Optimize cond_resched() by removing one conditional. Currently cond_resched() checks system_state == SYSTEM_RUNNING in order to avoid scheduling before the scheduler is running. We can however, as per suggestion of Matt, use PREEMPT_ACTIVE to accomplish that very same. Suggested-by: Matt Mackall Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a99f1c15cf8..16a982e389fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -501,8 +501,11 @@ struct task_cputime { /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). + * + * We include PREEMPT_ACTIVE to avoid cond_resched() from working + * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1) +#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts -- cgit v1.2.3 From 6301cb95c119ebf324bb96ee226fa9ddffad80a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:47 +0200 Subject: sched: fix nr_uninterruptible accounting of frozen tasks really commit e3c8ca8336 (sched: do not count frozen tasks toward load) broke the nr_uninterruptible accounting on freeze/thaw. On freeze the task is excluded from accounting with a check for (task->flags & PF_FROZEN), but that flag is cleared before the task is thawed. So while we prevent that the task with state TASK_UNINTERRUPTIBLE is accounted to nr_uninterruptible on freeze we decrement nr_uninterruptible on thaw. Use a separate flag which is handled by the freezing task itself. Set it before calling the scheduler with TASK_UNINTERRUPTIBLE state and clear it after we return from frozen state. Cc: Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 16a982e389fb..3ab08e4bb6b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -209,7 +209,7 @@ extern unsigned long long time_sync_thresh; ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FREEZING) == 0) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -1680,6 +1680,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ -- cgit v1.2.3 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..0f1ea4a66957 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1198,6 +1198,7 @@ struct task_struct { * a short time */ unsigned char fpu_counter; + s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif -- cgit v1.2.3