From 625f2a378e5a10f45fdc37932fc9f8a21676de9e Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 22 Apr 2011 11:19:10 -0600
Subject: sched: Get rid of lock_depth

Neil Brown pointed out that lock_depth somehow escaped the BKL
removal work.  Let's get rid of it now.

Note that the perf scripting utilities still have a bunch of
code for dealing with common_lock_depth in tracepoints; I have
left that in place in case anybody wants to use that code with
older kernels.

Suggested-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110422111910.456c0e84@bike.lwn.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..aca62871a4f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	posix_cpu_timers_init(p);
 
-	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
-- 
cgit v1.2.3


From 3e51e3edfd81bfd9853ad7de91167e4ce33d0fe7 Mon Sep 17 00:00:00 2001
From: Samir Bellabes <sam@synack.fr>
Date: Wed, 11 May 2011 18:18:05 +0200
Subject: sched: Remove unused parameters from sched_fork() and
 wake_up_new_task()

sched_fork() and wake_up_new_task() are defined with a parameter
'unsigned long clone_flags', which is unused.

This patch removes the parameters.

Signed-off-by: Samir Bellabes <sam@synack.fr>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1305130685-1047-1-git-send-email-sam@synack.fr
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index aca62871a4f9..2b44d82b8237 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1152,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
-	sched_fork(p, clone_flags);
+	sched_fork(p);
 
 	retval = perf_event_init_task(p);
 	if (retval)
@@ -1463,7 +1463,7 @@ long do_fork(unsigned long clone_flags,
 		 */
 		p->flags &= ~PF_STARTING;
 
-		wake_up_new_task(p, clone_flags);
+		wake_up_new_task(p);
 
 		tracehook_report_clone_complete(trace, regs,
 						clone_flags, nr, p);
-- 
cgit v1.2.3


From 97a894136f29802da19a15541de3c019e1ca147e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 24 May 2011 17:12:04 -0700
Subject: mm: Remove i_mmap_lock lockbreak

Hugh says:
 "The only significant loser, I think, would be page reclaim (when
  concurrent with truncation): could spin for a long time waiting for
  the i_mmap_mutex it expects would soon be dropped? "

Counter points:
 - cpu contention makes the spin stop (need_resched())
 - zap pages should be freeing pages at a higher rate than reclaim
   ever can

I think the simplification of the truncate code is definitely worth it.

Effectively reverts: 2aa15890f3c ("mm: prevent concurrent
unmap_mapping_range() on the same inode") and takes out the code that
caused its problem.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 2b44d82b8237..4eef925477fc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -386,7 +386,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			spin_lock(&mapping->i_mmap_lock);
 			if (tmp->vm_flags & VM_SHARED)
 				mapping->i_mmap_writable++;
-			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			vma_prio_tree_add(tmp, mpnt);
-- 
cgit v1.2.3


From 3d48ae45e72390ddf8cc5256ac32ed6f7a19cbea Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 24 May 2011 17:12:06 -0700
Subject: mm: Convert i_mmap_lock to a mutex

Straightforward conversion of i_mmap_lock to a mutex.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4eef925477fc..927692734bcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -383,14 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
-			spin_lock(&mapping->i_mmap_lock);
+			mutex_lock(&mapping->i_mmap_mutex);
 			if (tmp->vm_flags & VM_SHARED)
 				mapping->i_mmap_writable++;
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(mapping);
-			spin_unlock(&mapping->i_mmap_lock);
+			mutex_unlock(&mapping->i_mmap_mutex);
 		}
 
 		/*
-- 
cgit v1.2.3


From de03c72cfce5b263a674d04348b58475ec50163c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 24 May 2011 17:12:15 -0700
Subject: mm: convert mm->cpu_vm_cpumask into cpumask_var_t

cpumask_t is very big struct and cpu_vm_mask is placed wrong position.
It might lead to reduce cache hit ratio.

This patch has two change.
1) Move the place of cpumask into last of mm_struct. Because usually cpumask
   is accessed only front bits when the system has cpu-hotplug capability
2) Convert cpu_vm_mask into cpumask_var_t. It may help to reduce memory
   footprint if cpumask_size() will use nr_cpumask_bits properly in future.

In addition, this patch change the name of cpu_vm_mask with cpu_vm_mask_var.
It may help to detect out of tree cpu_vm_mask users.

This patch has no functional change.

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 927692734bcf..8e7e135d0817 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -485,6 +485,20 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (oldmm)
+		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
+	else
+		memset(mm_cpumask(mm), 0, cpumask_size());
+#endif
+	return 0;
+}
+
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -521,10 +535,20 @@ struct mm_struct * mm_alloc(void)
 	struct mm_struct * mm;
 
 	mm = allocate_mm();
-	if (mm) {
-		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm, current);
+	if (!mm)
+		return NULL;
+
+	memset(mm, 0, sizeof(*mm));
+	mm = mm_init(mm, current);
+	if (!mm)
+		return NULL;
+
+	if (mm_init_cpumask(mm, NULL)) {
+		mm_free_pgd(mm);
+		free_mm(mm);
+		return NULL;
 	}
+
 	return mm;
 }
 
@@ -536,6 +560,7 @@ struct mm_struct * mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
+	free_cpumask_var(mm->cpu_vm_mask_var);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -690,6 +715,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
+	if (mm_init_cpumask(mm, oldmm))
+		goto fail_nocpumask;
+
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
@@ -716,6 +744,9 @@ fail_nomem:
 	return NULL;
 
 fail_nocontext:
+	free_cpumask_var(mm->cpu_vm_mask_var);
+
+fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
-- 
cgit v1.2.3


From 4714d1d32d97239fb5ae3e10521d3f133a899b66 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@andrew.cmu.edu>
Date: Thu, 26 May 2011 16:25:18 -0700
Subject: cgroups: read-write lock CLONE_THREAD forking per threadgroup

Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup

Add an rwsem that lives in a threadgroup's signal_struct that's taken for
reading in the fork path, under CONFIG_CGROUPS.  If another part of the
kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other
system would both depend on.

This is a pre-patch for cgroup-procs-write.patch.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8e7e135d0817..1fa9d940e301 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -957,6 +957,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1138,6 +1142,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	monotonic_to_bootbased(&p->real_start_time);
 	p->io_context = NULL;
 	p->audit_context = NULL;
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_lock(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1342,6 +1348,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	perf_event_fork(p);
 	return p;
 
@@ -1380,6 +1388,8 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
 #endif
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
-- 
cgit v1.2.3


From a77aea92010acf54ad785047234418d5d68772e2 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@free.fr>
Date: Thu, 26 May 2011 16:25:23 -0700
Subject: cgroup: remove the ns_cgroup

The ns_cgroup is an annoying cgroup at the namespace / cgroup frontier and
leads to some problems:

  * cgroup creation is out-of-control
  * cgroup name can conflict when pids are looping
  * it is not possible to have a single process handling a lot of
    namespaces without falling in a exponential creation time
  * we may want to create a namespace without creating a cgroup

  The ns_cgroup was replaced by a compatibility flag 'clone_children',
  where a newly created cgroup will copy the parent cgroup values.
  The userspace has to manually create a cgroup and add a task to
  the 'tasks' file.

This patch removes the ns_cgroup as suggested in the following thread:

https://lists.linux-foundation.org/pipermail/containers/2009-June/018616.html

The 'cgroup_clone' function is removed because it is no longer used.

This is a userspace-visible change.  Commit 45531757b45c ("cgroup: notify
ns_cgroup deprecated") (merged into 2.6.27) caused the kernel to emit a
printk warning users that the feature is planned for removal.  Since that
time we have heard from XXX users who were affected by this.

Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jamal Hadi Salim <hadi@cyberus.ca>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1fa9d940e301..1f84099ecce6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1229,12 +1229,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
 
-	if (current->nsproxy != p->nsproxy) {
-		retval = ns_cgroup_clone(p, pid);
-		if (retval)
-			goto bad_fork_free_pid;
-	}
-
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
-- 
cgit v1.2.3


From 3864601387cf4196371e3c1897fdffa5228296f9 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 26 May 2011 16:25:46 -0700
Subject: mm: extract exe_file handling from procfs

Setup and cleanup of mm_struct->exe_file is currently done in fs/proc/.
This was because exe_file was needed only for /proc/<pid>/exe.  Since we
will need the exe_file functionality also for core dumps (so core name can
contain full binary path), built this functionality always into the
kernel.

To achieve that move that out of proc FS to the kernel/ where in fact it
should belong.  By doing that we can make dup_mm_exe_file static.  Also we
can drop linux/proc_fs.h inclusion in fs/exec.c and kernel/fork.c.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1f84099ecce6..ca406d916713 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,7 +59,6 @@
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
@@ -597,6 +596,57 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas--;
+	if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+		fput(mm->exe_file);
+		mm->exe_file = NULL;
+	}
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+	if (new_exe_file)
+		get_file(new_exe_file);
+	if (mm->exe_file)
+		fput(mm->exe_file);
+	mm->exe_file = new_exe_file;
+	mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	struct file *exe_file;
+
+	/* We need mmap_sem to protect against races with removal of
+	 * VM_EXECUTABLE vmas */
+	down_read(&mm->mmap_sem);
+	exe_file = mm->exe_file;
+	if (exe_file)
+		get_file(exe_file);
+	up_read(&mm->mmap_sem);
+	return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+	/* It's safe to write the exe_file pointer without exe_file_lock because
+	 * this is called during fork when the task is not yet in /proc */
+	newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
-- 
cgit v1.2.3


From 6345d24daf0c1fffe6642081d783cdf653ebaa5c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 29 May 2011 11:32:28 -0700
Subject: mm: Fix boot crash in mm_alloc()

Thomas Gleixner reports that we now have a boot crash triggered by
CONFIG_CPUMASK_OFFSTACK=y:

    BUG: unable to handle kernel NULL pointer dereference at   (null)
    IP: [<c11ae035>] find_next_bit+0x55/0xb0
    Call Trace:
     [<c11addda>] cpumask_any_but+0x2a/0x70
     [<c102396b>] flush_tlb_mm+0x2b/0x80
     [<c1022705>] pud_populate+0x35/0x50
     [<c10227ba>] pgd_alloc+0x9a/0xf0
     [<c103a3fc>] mm_init+0xec/0x120
     [<c103a7a3>] mm_alloc+0x53/0xd0

which was introduced by commit de03c72cfce5 ("mm: convert
mm->cpu_vm_cpumask into cpumask_var_t"), and is due to wrong ordering of
mm_init() vs mm_init_cpumask

Thomas wrote a patch to just fix the ordering of initialization, but I
hate the new double allocation in the fork path, so I ended up instead
doing some more radical surgery to clean it all up.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 42 ++++++++++--------------------------------
 1 file changed, 10 insertions(+), 32 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index ca406d916713..0276c30401a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (oldmm)
-		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-	else
-		memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-	return 0;
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm = mm_init(mm, current);
-	if (!mm)
-		return NULL;
-
-	if (mm_init_cpumask(mm, NULL)) {
-		mm_free_pgd(mm);
-		free_mm(mm);
-		return NULL;
-	}
-
-	return mm;
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -559,7 +536,6 @@ struct mm_struct * mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
-	free_cpumask_var(mm->cpu_vm_mask_var);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -753,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -765,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-	if (mm_init_cpumask(mm, oldmm))
-		goto fail_nocpumask;
-
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
@@ -794,9 +768,6 @@ fail_nomem:
 	return NULL;
 
 fail_nocontext:
-	free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
@@ -1591,6 +1562,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have.  The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
-- 
cgit v1.2.3